Shrink Unicode tables (even more) #70486

Merged · 14 commits · Mar 28, 2020

25 changes: 0 additions & 25 deletions src/libcore/unicode/mod.rs
@@ -32,28 +32,3 @@ pub use unicode_data::lowercase::lookup as Lowercase;
pub use unicode_data::n::lookup as N;
pub use unicode_data::uppercase::lookup as Uppercase;
pub use unicode_data::white_space::lookup as White_Space;

#[inline(always)]
fn range_search<const N: usize, const N1: usize, const N2: usize>(
needle: u32,
chunk_idx_map: &[u8; N],
(last_chunk_idx, last_chunk_mapping): (u16, u8),
bitset_chunk_idx: &[[u8; 16]; N1],
bitset: &[u64; N2],
) -> bool {
let bucket_idx = (needle / 64) as usize;
let chunk_map_idx = bucket_idx / 16;
let chunk_piece = bucket_idx % 16;
let chunk_idx = if chunk_map_idx >= N {
if chunk_map_idx == last_chunk_idx as usize {
last_chunk_mapping
} else {
return false;
}
} else {
chunk_idx_map[chunk_map_idx]
};
let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
let word = bitset[(idx as usize)];
(word & (1 << (needle % 64) as u64)) != 0
}
957 changes: 443 additions & 514 deletions src/libcore/unicode/unicode_data.rs

Large diffs are not rendered by default.

186 changes: 182 additions & 4 deletions src/tools/unicode-table-generator/src/main.rs
@@ -1,9 +1,83 @@
//! This implements the core logic of the compression scheme used to compactly
//! encode Unicode properties.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice to have rather than a key design constraint.
//! It is expected that libraries/applications which are performance-sensitive
//! to Unicode property lookups are extremely rare, and those that care may find
//! the tradeoff of the raw bitsets worth it. For most applications, a
//! relatively fast but much smaller (and as such less cache-impacting, etc.)
//! data set is likely preferable.
//!
//! We have two separate encoding schemes: a skiplist-like approach, and a
//! compressed bitset. The datasets we consider mostly use the skiplist (it's
//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
//! Since the bitset is also faster, this seems an obvious choice. (As a
//! historical note, the bitset was also the prior implementation, so its
//! relative complexity had already been paid).
//!
//! ## The bitset
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! 0x110000 bits -- over 136 kilobytes of data per character set -- way too
//! much for our purposes.
//!
//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
//! not skipping the small 'gap') is chunked into words (u64) and
//! deduplicated. On random data, this would be useless; on our data, it is
//! incredibly beneficial -- our data sets have (far) fewer than 256 unique
//! words.
//!
//! This gives us an array that maps `u8 -> word`; the current algorithm does
//! not handle the case of more than 256 unique words, but our data sets are
//! nowhere near that limit.
//!
//! With that scheme, we now have a single byte for every 64 codepoints.
//!
//! We further chunk these by some constant N (between 1 and 64 per group,
//! dynamically chosen for smallest size), and again deduplicate and store in an
//! array (u8 -> [u8; N]).
//!
//! The bytes of this array map into the words from the bitset above, but we
//! apply another trick here: some of these words are similar enough that they
//! can be represented by some function of another word. The particular
//! functions chosen are rotation, inversion, and shifting (right).
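A minimal sketch of that word derivation, mirroring the decode step of `bitset_search` in `range_search.rs` (added below in this PR); the helper name is illustrative, not part of the generated code:

```rust
// Derives a non-canonical word from a canonical one. Bit 6 of the mapping
// byte requests inversion, bit 7 selects a right-shift (otherwise a
// left-rotation), and the low six bits give the shift/rotation amount.
fn apply_mapping(canonical: u64, mapping: u8) -> u64 {
    let mut word = canonical;
    if mapping & (1 << 6) != 0 {
        word = !word;
    }
    let quantity = mapping & ((1 << 6) - 1);
    if mapping & (1 << 7) != 0 {
        word >> quantity as u32
    } else {
        word.rotate_left(quantity as u32)
    }
}
```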
//!
//! ## The skiplist
//!
//! The skip list arose out of the desire for an even smaller encoding than the
//! bitset -- and was the answer to the question "what is the smallest
//! representation we can imagine?". However, it is not necessarily the
//! smallest, and if you have a better proposal, please do suggest it!
//!
//! This is a relatively straightforward encoding. First, we break up all the
//! ranges in the input data into offsets from each other, essentially a gap
//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
//! store those directly. The larger gaps (which are nicely interspersed
//! already) serve as natural break points for indexing the data set.
//!
//! In particular, each run of small gaps (terminating in a large gap) is
//! indexed in a separate dataset. That data set stores an index into the
//! primary offset list and a prefix sum of that offset list. These are packed
//! into a single u32 (11 bits for the index, 21 bits for the prefix sum).
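A sketch of that packing (the matching decoders, `decode_length` and `decode_prefix_sum`, appear in `range_search.rs` below); the helper name and values are illustrative:

```rust
// Index into the offset list in the top 11 bits, prefix sum in the low 21.
fn pack_header(start_index: u32, prefix_sum: u32) -> u32 {
    debug_assert!(start_index < (1 << 11));
    debug_assert!(prefix_sum < (1 << 21));
    (start_index << 21) | prefix_sum
}
// e.g. pack_header(9, 8192) == (9 << 21) | 8192 == 18_882_560
```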
//!
//! Lookup proceeds via a binary search in the index, followed by a
//! straightforward linear scan (adding up the offsets) until we reach the
//! needle; the parity of the offset index at which we stop tells us whether
//! the needle is in the set or not.
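To make the skiplist encoding concrete, here is a hand-worked example for a hypothetical property consisting of the two ranges `48..58` and `300..400` (the same table is exercised against `skip_search` further down):

```rust
// Boundary points (flattened range starts/ends): 48, 58, 300, 400.
// Deltas from the previous boundary (from 0):    48, 10, 242, 100.
// All deltas fit in a u8, so there is a single run, terminated by an
// artificial gap out to char::MAX + 1; that oversized delta is stored as a
// 0 placeholder so index parity keeps counting boundaries correctly.
static OFFSETS: [u8; 5] = [48, 10, 242, 100, 0];
// One header: the run starts at offset index 0 (upper 11 bits); its prefix
// sum (lower 21 bits) is 400 + 0x11_0000, which exceeds char::MAX as the
// lookup's binary search requires.
static SHORT_OFFSET_RUNS: [u32; 1] = [(0 << 21) | (400 + 0x11_0000)];
```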

use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;

mod case_mapping;
mod raw_emitter;
mod skiplist;
mod unicode_download;

use raw_emitter::{emit_codepoints, RawEmitter};
@@ -152,9 +226,17 @@ fn main() {
std::process::exit(1);
});

// Optional test path, which is a Rust source file testing that the unicode
// property lookups are correct.
let test_path = std::env::args().nth(2);

let unicode_data = load_data();
let ranges_by_property = &unicode_data.ranges;

if let Some(path) = test_path {
std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
}

let mut total_bytes = 0;
let mut modules = Vec::new();
for (property, ranges) in ranges_by_property {
@@ -163,7 +245,16 @@ fn main() {
emit_codepoints(&mut emitter, &ranges);

modules.push((property.to_lowercase().to_string(), emitter.file));
println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
println!(
"{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
property,
emitter.bytes_used,
datapoints,
ranges.len(),
ranges.first().unwrap().start,
ranges.last().unwrap().end,
emitter.desc,
);
total_bytes += emitter.bytes_used;
}

Expand All @@ -173,7 +264,10 @@ fn main() {
"///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
);

table_file.push_str("use super::range_search;\n\n");
// Include the range search function
table_file.push('\n');
table_file.push_str(include_str!("range_search.rs"));
table_file.push('\n');

table_file.push_str(&version());

@@ -236,26 +330,110 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
out
}

fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
let mut s = String::new();
s.push_str("#![allow(incomplete_features, unused)]\n");
s.push_str("#![feature(const_generics)]\n\n");
s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
s.push_str(&format!("#[path = \"{}\"]\n", data_path));
s.push_str("mod unicode_data;\n\n");

s.push_str("\nfn main() {\n");

for (property, ranges) in ranges {
s.push_str(&format!(r#" println!("Testing {}");"#, property));
s.push('\n');
s.push_str(&format!(" {}_true();\n", property.to_lowercase()));
s.push_str(&format!(" {}_false();\n", property.to_lowercase()));
let mut is_true = Vec::new();
let mut is_false = Vec::new();
for ch_num in 0..=(std::char::MAX as u32) {
if std::char::from_u32(ch_num).is_none() {
continue;
}
if ranges.iter().any(|r| r.contains(&ch_num)) {
is_true.push(ch_num);
} else {
is_false.push(ch_num);
}
}

s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase()));
generate_asserts(&mut s, property, &is_true, true);
s.push_str(" }\n\n");
s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase()));
generate_asserts(&mut s, property, &is_false, false);
s.push_str(" }\n\n");
}

s.push_str("}");
s
}

fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
for range in ranges_from_set(points) {
if range.end == range.start + 1 {
s.push_str(&format!(
" assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
if truthy { "" } else { "!" },
property.to_lowercase(),
std::char::from_u32(range.start).unwrap(),
range.start,
));
} else {
s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
s.push_str(&format!(
" assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
if truthy { "" } else { "!" },
property.to_lowercase(),
));
s.push_str(" }\n");
}
}
}
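For reference, the test file this emits looks roughly like the following; the property name, codepoints, and indentation here are illustrative:

```rust
#![allow(incomplete_features, unused)]
#![feature(const_generics)]

#[allow(unused)]
use std::hint;
#[path = "unicode_data.rs"]
mod unicode_data;

fn main() {
    println!("Testing Alphabetic");
    alphabetic_true();
    alphabetic_false();
    fn alphabetic_true() {
        // isolated codepoints get a direct assert...
        assert!(unicode_data::alphabetic::lookup('ª'), "170");
        // ...while runs of codepoints get a loop
        for chn in 97..123u32 {
            assert!(unicode_data::alphabetic::lookup(std::char::from_u32(chn).unwrap()), "{:?}", chn);
        }
    }
    fn alphabetic_false() {
        assert!(!unicode_data::alphabetic::lookup('!'), "33");
    }
}
```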

fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
merge_ranges(&mut ranges);
ranges
}

fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
loop {
let mut new_ranges = Vec::new();
let mut idx_iter = 0..(ranges.len() - 1);
let mut should_insert_last = true;
while let Some(idx) = idx_iter.next() {
let cur = ranges[idx].clone();
let next = ranges[idx + 1].clone();
if cur.end == next.start {
// Also consume `next`, since we're merging it into `cur`; if it was
// the last element, the merged range takes its place.
if idx_iter.next().is_none() {
should_insert_last = false;
}
new_ranges.push(cur.start..next.end);
} else {
// We're *not* merging the last element
should_insert_last = true;
new_ranges.push(cur);
}
}
if should_insert_last {
new_ranges.push(ranges.last().unwrap().clone());
}
if new_ranges.len() == ranges.len() {
*ranges = new_ranges;
break;
} else {
*ranges = new_ranges;
}
}

let mut last_end = None;
for range in ranges {
if let Some(last) = last_end {
assert!(range.start > last, "{:?}", range);
}
last_end = Some(range.end);
}
}
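As a quick illustration of the fixed point this converges to (input values hypothetical):

```rust
// `ranges_from_set` produces unit ranges; repeated passes of `merge_ranges`
// coalesce adjacent ones until the length stops shrinking.
fn main() {
    let mut ranges = vec![1..2, 2..3, 3..4, 7..8, 8..9];
    merge_ranges(&mut ranges);
    assert_eq!(ranges, vec![1..4, 7..9]);
}
```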
93 changes: 93 additions & 0 deletions src/tools/unicode-table-generator/src/range_search.rs
@@ -0,0 +1,93 @@
#[inline(always)]
fn bitset_search<
const N: usize,
const CHUNK_SIZE: usize,
const N1: usize,
const CANONICAL: usize,
const CANONICALIZED: usize,
>(
needle: u32,
chunk_idx_map: &[u8; N],
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
bitset_canonical: &[u64; CANONICAL],
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
) -> bool {
let bucket_idx = (needle / 64) as usize;
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
let chunk_piece = bucket_idx % CHUNK_SIZE;
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
v
} else {
return false;
};
let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
let word = if let Some(word) = bitset_canonical.get(idx) {
*word
} else {
let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
let mut word = bitset_canonical[real_idx as usize];
let should_invert = mapping & (1 << 6) != 0;
if should_invert {
word = !word;
}
// Lower 6 bits
let quantity = mapping & ((1 << 6) - 1);
if mapping & (1 << 7) != 0 {
// shift
word >>= quantity as u64;
} else {
word = word.rotate_left(quantity as u32);
}
word
};
(word & (1 << (needle % 64) as u64)) != 0
}
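A minimal, hypothetical instantiation of `bitset_search`: a set containing exactly the ASCII digits (codepoints 48..58). All names and values below are illustrative, not part of the generated tables:

```rust
// One 64-codepoint bucket suffices: bits 48..58 of word 0. CHUNK_SIZE = 1,
// one chunk, one canonical word, and no derived (canonicalized) words.
const DIGITS: u64 = 0x3FF << 48; // ten set bits at positions 48..58

static CHUNK_IDX_MAP: [u8; 1] = [0]; // bucket 0 -> chunk 0; other buckets miss
static BITSET_CHUNK_IDX: [[u8; 1]; 1] = [[0]]; // chunk 0, piece 0 -> word 0
static BITSET_CANONICAL: [u64; 1] = [DIGITS];
static BITSET_CANONICALIZED: [(u8, u8); 0] = [];

fn main() {
    let lookup = |c: char| {
        bitset_search(
            c as u32,
            &CHUNK_IDX_MAP,
            &BITSET_CHUNK_IDX,
            &BITSET_CANONICAL,
            &BITSET_CANONICALIZED,
        )
    };
    assert!(lookup('7'));
    assert!(!lookup('/')); // codepoint 47: in bucket 0, bit unset
    assert!(!lookup('a')); // codepoint 97: bucket 1 is out of range
}
```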

fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
short_offset_run_header & ((1 << 21) - 1)
}

fn decode_length(short_offset_run_header: u32) -> usize {
(short_offset_run_header >> 21) as usize
}

#[inline(always)]
fn skip_search<const SOR: usize, const OFFSETS: usize>(
needle: u32,
short_offset_runs: &[u32; SOR],
offsets: &[u8; OFFSETS],
) -> bool {
// Note that this *cannot* be past the end of the array, as the last
// element is greater than std::char::MAX (the largest possible needle).
//
// So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
// location cannot be past it, so Err(idx) != length either.
//
// This means that we can avoid bounds checking for the accesses below, too.
let last_idx =
match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
Ok(idx) => idx + 1,
Err(idx) => idx,
};

let mut offset_idx = decode_length(short_offset_runs[last_idx]);
let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
decode_length(*next) - offset_idx
} else {
offsets.len() - offset_idx
};
let prev =
last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);

let total = needle - prev;
let mut prefix_sum = 0;
for _ in 0..(length - 1) {
let offset = offsets[offset_idx];
prefix_sum += offset as u32;
if prefix_sum > total {
break;
}
offset_idx += 1;
}
offset_idx % 2 == 1
}
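And a corresponding sanity check for `skip_search`, reusing the hypothetical two-range set (`48..58` and `300..400`) hand-encoded in the generator's module docs above:

```rust
static OFFSETS: [u8; 5] = [48, 10, 242, 100, 0]; // deltas; trailing 0 marks the huge final gap
static SHORT_OFFSET_RUNS: [u32; 1] = [(0 << 21) | (400 + 0x11_0000)]; // start 0, prefix sum > char::MAX

fn main() {
    assert!(skip_search(55, &SHORT_OFFSET_RUNS, &OFFSETS)); // in 48..58
    assert!(!skip_search(58, &SHORT_OFFSET_RUNS, &OFFSETS)); // range ends are exclusive
    assert!(skip_search(399, &SHORT_OFFSET_RUNS, &OFFSETS)); // in 300..400
    assert!(!skip_search(1000, &SHORT_OFFSET_RUNS, &OFFSETS)); // past the last range
}
```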