From d4819632e2d0e135cff3e1987b77ca584e22a41a Mon Sep 17 00:00:00 2001 From: "Bruce A. MacNaughton" Date: Tue, 19 Jul 2022 17:35:19 -0700 Subject: [PATCH 1/4] working updates --- .../src/cascading_map.rs | 90 +++++++++++++++++++ src/tools/unicode-table-generator/src/main.rs | 10 ++- .../src/raw_emitter.rs | 10 +++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 src/tools/unicode-table-generator/src/cascading_map.rs diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs new file mode 100644 index 0000000000000..780d32e969c6d --- /dev/null +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -0,0 +1,90 @@ +use crate::fmt_list; +use crate::raw_emitter::RawEmitter; +use std::collections::HashMap; +use std::fmt::Write as _; +use std::ops::Range; + + +impl RawEmitter { + pub fn emit_cascading_map(&mut self, ranges: &[Range]) -> bool { + + let mut map: [u8; 256] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + let points = ranges.iter().flat_map( + |r| (r.start..r.end).into_iter().collect::>() + ).collect::>(); + + println!("there are {} points", points.len()); + + // how many distinct ranges need to be counted? + let mut codepoints_by_high_bytes = HashMap::>::new(); + for point in points { + // assert that there is no whitespace over the 0x3000 range. + assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); + let high_bytes = point as usize >> 8; + let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new); + codepoints.push(point); + } + + let mut bit_for_high_byte = 1u8; + let mut arms = Vec::::new(); + + let mut high_bytes: Vec = codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); + high_bytes.sort(); + for high_byte in high_bytes { + let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); + if codepoints.len() == 1 { + let ch = codepoints.pop().unwrap(); + arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch)); + continue; + } + // more than 1 codepoint in this arm + for codepoint in codepoints { + map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; + } + arms.push(format!( + "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0", + high_byte, + bit_for_high_byte) + ); + bit_for_high_byte <<= 1; + } + + writeln!( + &mut self.file, + "static WHITESPACE_MAP: [u8; 256] = [{}];", + fmt_list(map.iter()) + ) + .unwrap(); + self.bytes_used += 256; + + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); + for arm in arms { + writeln!(&mut self.file, " {},", arm).unwrap(); + } + writeln!(&mut self.file, " _ => false,").unwrap(); + writeln!(&mut self.file, " }}").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + + true + } +} diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 4720ee7020f89..b40afcf4bf2d4 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -78,9 +78,10 @@ use ucd_parse::Codepoints; mod case_mapping; mod raw_emitter; mod skiplist; +mod cascading_map; mod unicode_download; -use raw_emitter::{emit_codepoints, RawEmitter}; +use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter}; static PROPERTIES: &[&str] = &[ "Alphabetic", @@ -241,8 +242,13 @@ fn main() { let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { let datapoints = ranges.iter().map(|r| r.end - r.start).sum::(); + let mut emitter = RawEmitter::new(); - emit_codepoints(&mut emitter, &ranges); + if property == &"White_Space" { + emit_whitespace(&mut emitter, &ranges); + } else { + emit_codepoints(&mut emitter, &ranges); + } modules.push((property.to_lowercase().to_string(), emitter.file)); println!( diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index ab8eaee9541a2..f5960c459200d 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -170,6 +170,16 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { } } +pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { + emitter.blank_line(); + + let mut cascading = emitter.clone(); + cascading.emit_cascading_map(&ranges); + *emitter = cascading; + emitter.desc = String::from("cascading"); + +} + struct Canonicalized { canonical_words: Vec, canonicalized_words: Vec<(u8, u8)>, From 89ace470dcc0d4e08c1fd530e938893f87dcd436 Mon Sep 17 00:00:00 2001 From: "Bruce A. MacNaughton" Date: Tue, 19 Jul 2022 18:03:18 -0700 Subject: [PATCH 2/4] formatted --- .../src/cascading_map.rs | 51 +++++++------------ src/tools/unicode-table-generator/src/main.rs | 2 +- .../src/raw_emitter.rs | 1 - 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index 780d32e969c6d..d4efdef2e806c 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -4,32 +4,24 @@ use std::collections::HashMap; use std::fmt::Write as _; use std::ops::Range; - impl RawEmitter { pub fn emit_cascading_map(&mut self, ranges: &[Range]) -> bool { - let mut map: [u8; 256] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let points = ranges.iter().flat_map( - |r| (r.start..r.end).into_iter().collect::>() - ).collect::>(); + let points = ranges + .iter() + .flat_map(|r| (r.start..r.end).into_iter().collect::>()) + .collect::>(); println!("there are {} points", points.len()); @@ -46,7 +38,8 @@ impl RawEmitter { let mut bit_for_high_byte = 1u8; let mut arms = Vec::::new(); - let mut high_bytes: Vec = codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); + let mut high_bytes: Vec = + codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); high_bytes.sort(); for high_byte in high_bytes { let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); @@ -61,21 +54,15 @@ impl RawEmitter { } arms.push(format!( "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0", - high_byte, - bit_for_high_byte) - ); + high_byte, bit_for_high_byte + )); bit_for_high_byte <<= 1; } - writeln!( - &mut self.file, - "static WHITESPACE_MAP: [u8; 256] = [{}];", - fmt_list(map.iter()) - ) - .unwrap(); + writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) + .unwrap(); self.bytes_used += 256; - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); for arm in arms { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index b40afcf4bf2d4..a3327a3c2ffd9 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -75,10 +75,10 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::Range; use ucd_parse::Codepoints; +mod cascading_map; mod case_mapping; mod raw_emitter; mod skiplist; -mod cascading_map; mod unicode_download; use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter}; diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index f5960c459200d..5aca86ba089d6 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -177,7 +177,6 @@ pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { cascading.emit_cascading_map(&ranges); *emitter = cascading; emitter.desc = String::from("cascading"); - } struct Canonicalized { From e5d4de39128205eb322c1e38219c0969c9af625b Mon Sep 17 00:00:00 2001 From: "Bruce A. MacNaughton" Date: Tue, 19 Jul 2022 18:03:33 -0700 Subject: [PATCH 3/4] generated code --- library/core/src/unicode/unicode_data.rs | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index d2073f86c010f..eb9334bdc6dc1 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -544,18 +544,25 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - static SHORT_OFFSET_RUNS: [u32; 4] = [ - 5760, 18882560, 23080960, 40972289, - ]; - static OFFSETS: [u8; 21] = [ - 9, 5, 18, 1, 100, 1, 26, 1, 0, 1, 0, 11, 29, 2, 5, 1, 47, 1, 0, 1, 0, + static WHITESPACE_MAP: [u8; 256] = [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; pub fn lookup(c: char) -> bool { - super::skip_search( - c as u32, - &SHORT_OFFSET_RUNS, - &OFFSETS, - ) + match c as u32 >> 8 { + 0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0, + 22 => c as u32 == 0x1680, + 32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0, + 48 => c as u32 == 0x3000, + _ => false, + } } } From 5d048eb69dc73aa6307f07ad6a21eee5e3e64c9f Mon Sep 17 00:00:00 2001 From: "Bruce A. MacNaughton" Date: Wed, 20 Jul 2022 16:13:54 -0700 Subject: [PATCH 4/4] add #inline --- library/core/src/unicode/unicode_data.rs | 1 + src/tools/unicode-table-generator/src/cascading_map.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index eb9334bdc6dc1..c1eff3a36e6e1 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -555,6 +555,7 @@ pub mod white_space { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { match c as u32 >> 8 { 0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0, diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index d4efdef2e806c..02c7542309a45 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -63,6 +63,7 @@ impl RawEmitter { .unwrap(); self.bytes_used += 256; + writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); for arm in arms {