From 002558f562d61f5b87ffc646616da1415d3eb360 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 11 Oct 2025 01:31:25 +0100 Subject: [PATCH 1/4] refactor: move runtime functions to core Instead of `include_str!()`ing `range_search.rs`, just make it a normal module under `core::unicode`. This means the same source code doesn't have to be checked in twice, and it plays nicer with IDEs. Also rename it to `rt` since it includes functions for searching the bitsets as well as the range representation. --- library/core/src/unicode/mod.rs | 1 + .../core/src/unicode/rt.rs | 14 +- library/core/src/unicode/unicode_data.rs | 130 +----------------- src/tools/unicode-table-generator/src/main.rs | 7 +- 4 files changed, 12 insertions(+), 140 deletions(-) rename src/tools/unicode-table-generator/src/range_search.rs => library/core/src/unicode/rt.rs (91%) diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index c71fa754e68fb..9bc4136517fae 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -18,6 +18,7 @@ pub(crate) use unicode_data::white_space::lookup as White_Space; pub(crate) mod printable; +mod rt; #[allow(unreachable_pub)] mod unicode_data; diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/library/core/src/unicode/rt.rs similarity index 91% rename from src/tools/unicode-table-generator/src/range_search.rs rename to library/core/src/unicode/rt.rs index 4d1dd9b423b59..566c3203dd1c0 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/library/core/src/unicode/rt.rs @@ -1,5 +1,7 @@ +//! Runtime support for `unicode_data`. 
+ #[inline(always)] -const fn bitset_search< +pub(super) const fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -46,10 +48,10 @@ const fn bitset_search< } #[repr(transparent)] -struct ShortOffsetRunHeader(u32); +pub(super) struct ShortOffsetRunHeader(pub(super) u32); impl ShortOffsetRunHeader { - const fn new(start_index: usize, prefix_sum: u32) -> Self { + pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self { assert!(start_index < (1 << 11)); assert!(prefix_sum < (1 << 21)); @@ -57,12 +59,12 @@ impl ShortOffsetRunHeader { } #[inline] - const fn start_index(&self) -> usize { + pub(super) const fn start_index(&self) -> usize { (self.0 >> 21) as usize } #[inline] - const fn prefix_sum(&self) -> u32 { + pub(super) const fn prefix_sum(&self) -> u32 { self.0 & ((1 << 21) - 1) } } @@ -72,7 +74,7 @@ impl ShortOffsetRunHeader { /// - The last element of `short_offset_runs` must be greater than `std::char::MAX`. /// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`. 
#[inline(always)] -unsafe fn skip_search( +pub(super) unsafe fn skip_search( needle: char, short_offset_runs: &[ShortOffsetRunHeader; SOR], offsets: &[u8; OFFSETS], diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 3c38b44224f87..90580e0740017 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -11,136 +11,8 @@ // to_upper : 13656 bytes // Total : 31911 bytes -#[inline(always)] -const fn bitset_search< - const N: usize, - const CHUNK_SIZE: usize, - const N1: usize, - const CANONICAL: usize, - const CANONICALIZED: usize, ->( - needle: u32, - chunk_idx_map: &[u8; N], - bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], - bitset_canonical: &[u64; CANONICAL], - bitset_canonicalized: &[(u8, u8); CANONICALIZED], -) -> bool { - let bucket_idx = (needle / 64) as usize; - let chunk_map_idx = bucket_idx / CHUNK_SIZE; - let chunk_piece = bucket_idx % CHUNK_SIZE; - // FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const. - let chunk_idx = if chunk_map_idx < chunk_idx_map.len() { - chunk_idx_map[chunk_map_idx] - } else { - return false; - }; - let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize; - // FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const. 
- let word = if idx < bitset_canonical.len() { - bitset_canonical[idx] - } else { - let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()]; - let mut word = bitset_canonical[real_idx as usize]; - let should_invert = mapping & (1 << 6) != 0; - if should_invert { - word = !word; - } - // Lower 6 bits - let quantity = mapping & ((1 << 6) - 1); - if mapping & (1 << 7) != 0 { - // shift - word >>= quantity as u64; - } else { - word = word.rotate_left(quantity as u32); - } - word - }; - (word & (1 << (needle % 64) as u64)) != 0 -} - -#[repr(transparent)] -struct ShortOffsetRunHeader(u32); - -impl ShortOffsetRunHeader { - const fn new(start_index: usize, prefix_sum: u32) -> Self { - assert!(start_index < (1 << 11)); - assert!(prefix_sum < (1 << 21)); - - Self((start_index as u32) << 21 | prefix_sum) - } - - #[inline] - const fn start_index(&self) -> usize { - (self.0 >> 21) as usize - } - - #[inline] - const fn prefix_sum(&self) -> u32 { - self.0 & ((1 << 21) - 1) - } -} - -/// # Safety -/// -/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`. -/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`. -#[inline(always)] -unsafe fn skip_search( - needle: char, - short_offset_runs: &[ShortOffsetRunHeader; SOR], - offsets: &[u8; OFFSETS], -) -> bool { - let needle = needle as u32; - - let last_idx = - match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header.0 << 11) { - Ok(idx) => idx + 1, - Err(idx) => idx, - }; - // SAFETY: `last_idx` *cannot* be past the end of the array, as the last - // element is greater than `std::char::MAX` (the largest possible needle) - // as guaranteed by the caller. - // - // So, we cannot have found it (i.e. `Ok(idx) => idx + 1 != length`) and the - // correct location cannot be past it, so `Err(idx) => idx != length` either. - // - // This means that we can avoid bounds checking for the accesses below, too. 
- // - // We need to use `intrinsics::assume` since the `panic_nounwind` contained - // in `hint::assert_unchecked` may not be optimized out. - unsafe { crate::intrinsics::assume(last_idx < SOR) }; - - let mut offset_idx = short_offset_runs[last_idx].start_index(); - let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { - (*next).start_index() - offset_idx - } else { - offsets.len() - offset_idx - }; - - let prev = - last_idx.checked_sub(1).map(|prev| short_offset_runs[prev].prefix_sum()).unwrap_or(0); - - let total = needle - prev; - let mut prefix_sum = 0; - for _ in 0..(length - 1) { - // SAFETY: It is guaranteed that `length <= OFFSETS - offset_idx`, - // so it follows that `length - 1 + offset_idx < OFFSETS`, therefore - // `offset_idx < OFFSETS` is always true in this loop. - // - // We need to use `intrinsics::assume` since the `panic_nounwind` contained - // in `hint::assert_unchecked` may not be optimized out. - unsafe { crate::intrinsics::assume(offset_idx < OFFSETS) }; - let offset = offsets[offset_idx]; - prefix_sum += offset as u32; - if prefix_sum > total { - break; - } - offset_idx += 1; - } - offset_idx % 2 == 1 -} - pub const UNICODE_VERSION: (u8, u8, u8) = (17, 0, 0); +use super::rt::*; #[rustfmt::skip] pub mod alphabetic { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index ded9205ffc4b9..32118fc75281b 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -264,13 +264,9 @@ fn main() { } table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); - // Include the range search function table_file.push('\n'); - table_file.push_str(include_str!("range_search.rs")); - table_file.push('\n'); - table_file.push_str(&version()); - + table_file.push_str("use super::rt::*;\n"); table_file.push('\n'); modules.push((String::from("conversions"), conversions)); @@ -335,6 +331,7 @@ fn generate_tests(data: 
&UnicodeData) -> Result { writeln!(s, "// ignore-tidy-filelength")?; writeln!(s, "use std::intrinsics;")?; writeln!(s, "mod unicode_data;")?; + writeln!(s, "mod rt {{ {} }}", include_str!("../../../../library/core/src/unicode/rt.rs"))?; writeln!(s, "fn main() {{")?; for (property, ranges) in &data.ranges { let prop = property.to_lowercase(); From 81439b1cec7f687844fcea050d16fb38274f1bea Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 11 Oct 2025 03:41:46 +0100 Subject: [PATCH 2/4] refactor: format `unicode_data` Run `rustfmt` on the generated tables. This means we won't have to worry so much about getting indentation and formatting right when generating code. Exempted for now some tables which would be too big when formatted by `rustfmt`. --- library/core/src/unicode/unicode_data.rs | 677 +++++++++++------- .../src/case_mapping.rs | 2 + src/tools/unicode-table-generator/src/main.rs | 6 +- 3 files changed, 412 insertions(+), 273 deletions(-) diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 90580e0740017..dd00c50c6966a 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -14,36 +14,60 @@ pub const UNICODE_VERSION: (u8, u8, u8) = (17, 0, 0); use super::rt::*; -#[rustfmt::skip] pub mod alphabetic { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 51] = [ - ShortOffsetRunHeader::new(0, 706), ShortOffsetRunHeader::new(12, 4681), - ShortOffsetRunHeader::new(414, 5741), ShortOffsetRunHeader::new(452, 7958), - ShortOffsetRunHeader::new(552, 9398), ShortOffsetRunHeader::new(623, 11264), - ShortOffsetRunHeader::new(625, 12293), ShortOffsetRunHeader::new(663, 13312), - ShortOffsetRunHeader::new(687, 19904), ShortOffsetRunHeader::new(688, 42125), - ShortOffsetRunHeader::new(690, 42509), ShortOffsetRunHeader::new(694, 55204), - ShortOffsetRunHeader::new(778, 63744), ShortOffsetRunHeader::new(783, 64110), - ShortOffsetRunHeader::new(784, 
64830), ShortOffsetRunHeader::new(806, 66176), - ShortOffsetRunHeader::new(847, 67383), ShortOffsetRunHeader::new(894, 73440), - ShortOffsetRunHeader::new(1217, 74650), ShortOffsetRunHeader::new(1228, 77712), - ShortOffsetRunHeader::new(1233, 78896), ShortOffsetRunHeader::new(1236, 82939), - ShortOffsetRunHeader::new(1240, 83527), ShortOffsetRunHeader::new(1242, 90368), - ShortOffsetRunHeader::new(1243, 92160), ShortOffsetRunHeader::new(1245, 92729), - ShortOffsetRunHeader::new(1246, 93504), ShortOffsetRunHeader::new(1261, 101590), - ShortOffsetRunHeader::new(1282, 110576), ShortOffsetRunHeader::new(1287, 110883), - ShortOffsetRunHeader::new(1294, 111356), ShortOffsetRunHeader::new(1304, 113664), - ShortOffsetRunHeader::new(1305, 119808), ShortOffsetRunHeader::new(1315, 120486), - ShortOffsetRunHeader::new(1352, 122624), ShortOffsetRunHeader::new(1375, 123536), - ShortOffsetRunHeader::new(1399, 124112), ShortOffsetRunHeader::new(1403, 126464), - ShortOffsetRunHeader::new(1431, 127280), ShortOffsetRunHeader::new(1497, 131072), - ShortOffsetRunHeader::new(1503, 173792), ShortOffsetRunHeader::new(1504, 178206), - ShortOffsetRunHeader::new(1506, 183982), ShortOffsetRunHeader::new(1508, 191457), - ShortOffsetRunHeader::new(1510, 192094), ShortOffsetRunHeader::new(1512, 194560), - ShortOffsetRunHeader::new(1513, 195102), ShortOffsetRunHeader::new(1514, 196608), - ShortOffsetRunHeader::new(1515, 201547), ShortOffsetRunHeader::new(1516, 210042), + ShortOffsetRunHeader::new(0, 706), + ShortOffsetRunHeader::new(12, 4681), + ShortOffsetRunHeader::new(414, 5741), + ShortOffsetRunHeader::new(452, 7958), + ShortOffsetRunHeader::new(552, 9398), + ShortOffsetRunHeader::new(623, 11264), + ShortOffsetRunHeader::new(625, 12293), + ShortOffsetRunHeader::new(663, 13312), + ShortOffsetRunHeader::new(687, 19904), + ShortOffsetRunHeader::new(688, 42125), + ShortOffsetRunHeader::new(690, 42509), + ShortOffsetRunHeader::new(694, 55204), + ShortOffsetRunHeader::new(778, 63744), + 
ShortOffsetRunHeader::new(783, 64110), + ShortOffsetRunHeader::new(784, 64830), + ShortOffsetRunHeader::new(806, 66176), + ShortOffsetRunHeader::new(847, 67383), + ShortOffsetRunHeader::new(894, 73440), + ShortOffsetRunHeader::new(1217, 74650), + ShortOffsetRunHeader::new(1228, 77712), + ShortOffsetRunHeader::new(1233, 78896), + ShortOffsetRunHeader::new(1236, 82939), + ShortOffsetRunHeader::new(1240, 83527), + ShortOffsetRunHeader::new(1242, 90368), + ShortOffsetRunHeader::new(1243, 92160), + ShortOffsetRunHeader::new(1245, 92729), + ShortOffsetRunHeader::new(1246, 93504), + ShortOffsetRunHeader::new(1261, 101590), + ShortOffsetRunHeader::new(1282, 110576), + ShortOffsetRunHeader::new(1287, 110883), + ShortOffsetRunHeader::new(1294, 111356), + ShortOffsetRunHeader::new(1304, 113664), + ShortOffsetRunHeader::new(1305, 119808), + ShortOffsetRunHeader::new(1315, 120486), + ShortOffsetRunHeader::new(1352, 122624), + ShortOffsetRunHeader::new(1375, 123536), + ShortOffsetRunHeader::new(1399, 124112), + ShortOffsetRunHeader::new(1403, 126464), + ShortOffsetRunHeader::new(1431, 127280), + ShortOffsetRunHeader::new(1497, 131072), + ShortOffsetRunHeader::new(1503, 173792), + ShortOffsetRunHeader::new(1504, 178206), + ShortOffsetRunHeader::new(1506, 183982), + ShortOffsetRunHeader::new(1508, 191457), + ShortOffsetRunHeader::new(1510, 192094), + ShortOffsetRunHeader::new(1512, 194560), + ShortOffsetRunHeader::new(1513, 195102), + ShortOffsetRunHeader::new(1514, 196608), + ShortOffsetRunHeader::new(1515, 201547), + ShortOffsetRunHeader::new(1516, 210042), ShortOffsetRunHeader::new(1518, 1324154), ]; static OFFSETS: [u8; 1519] = [ @@ -52,57 +76,58 @@ pub mod alphabetic { 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, 3, 2, 1, 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 5, 24, 1, 7, 7, 1, 8, 42, 10, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, 3, 4, 3, 8, - 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 
5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, - 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, 1, 5, - 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, - 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, 3, 3, 3, 12, - 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, 1, 3, 1, - 2, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 5, 3, 1, 4, 13, 3, - 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, 24, 1, 9, 1, - 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, 1, 1, 1, 19, - 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, 67, 55, 1, 1, 2, 5, - 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, - 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, 0, 2, 17, 1, 26, - 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, 1, 4, 1, 67, 89, - 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, 63, 2, 20, 50, 1, - 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 1, 11, 14, 55, 22, 3, 10, 36, 2, - 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, 0, 2, 6, 2, 38, 2, 6, 2, 8, 1, - 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, 1, - 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, 5, - 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, 16, 23, 9, 7, 1, - 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, 5, 4, 86, 6, 3, - 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, 46, 2, 0, 3, 16, 10, 2, 20, 47, - 5, 8, 3, 113, 39, 9, 2, 103, 2, 82, 20, 21, 1, 33, 24, 52, 12, 68, 1, 1, 44, 6, 3, 1, 1, 3, - 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 
5, 1, 55, 9, 14, 18, 23, 3, 69, 1, - 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, 43, 1, 14, 6, 123, 21, 0, 12, - 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, 1, 1, 1, 2, 1, 2, 1, 108, 33, 0, - 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, 89, 3, 6, 2, 6, 2, 6, 2, 3, 35, - 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, 29, 3, 49, 47, 32, 13, 30, 5, 43, - 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, 8, 52, 12, 11, 1, 15, 1, 7, 1, 2, - 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, 6, 1, 42, 1, 9, 69, 6, 2, 1, 1, - 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, 10, 26, 6, 26, 38, 56, 6, 2, 64, - 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, 27, 54, 10, 22, 10, 19, 13, 18, - 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, 42, 1, 2, 3, 2, 16, 6, 50, 3, 3, - 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, 10, 57, 9, 1, 13, 25, 23, 51, 17, 4, - 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, 4, 62, 7, 1, 1, 1, 4, - 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, 3, 1, 6, - 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, 4, 1, 2, 3, 1, 1, 1, 44, 66, 1, 3, - 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, 2, 1, 71, - 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, 4, 93, 8, - 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 18, 73, 103, 8, 88, 33, 31, 9, 1, 45, - 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, 6, - 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 23, 44, 0, 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, 102, 111, - 17, 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, 48, 16, - 4, 31, 21, 5, 19, 0, 45, 211, 64, 32, 25, 2, 25, 44, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 7, - 9, 0, 41, 32, 97, 115, 0, 4, 1, 7, 1, 2, 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 
5, - 13, 3, 9, 7, 10, 4, 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, - 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, - 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, - 1, 112, 45, 10, 7, 16, 1, 0, 30, 18, 44, 0, 28, 228, 30, 2, 1, 207, 31, 1, 22, 8, 2, 224, 7, - 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, - 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, - 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, - 32, 0, 2, 0, 2, 0, 15, 0, 0, 0, 0, 0, 5, 0, 0, + 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, + 2, 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, + 1, 5, 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, + 8, 2, 2, 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, 3, + 3, 3, 12, 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, + 1, 3, 1, 2, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 5, 3, 1, + 4, 13, 3, 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, + 24, 1, 9, 1, 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, + 24, 1, 1, 1, 19, 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, + 67, 55, 1, 1, 2, 5, 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, + 2, 41, 1, 4, 2, 33, 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, + 6, 3, 0, 2, 17, 1, 26, 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, + 19, 14, 1, 4, 1, 67, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, + 54, 28, 4, 63, 2, 20, 50, 1, 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 
1, + 11, 14, 55, 22, 3, 10, 36, 2, 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, + 0, 2, 6, 2, 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, + 2, 6, 4, 13, 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, + 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, 5, 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, + 1, 2, 56, 7, 1, 16, 23, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, + 25, 9, 7, 5, 2, 5, 4, 86, 6, 3, 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, + 46, 2, 0, 3, 16, 10, 2, 20, 47, 5, 8, 3, 113, 39, 9, 2, 103, 2, 82, 20, 21, 1, 33, 24, 52, + 12, 68, 1, 1, 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, + 1, 55, 9, 14, 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, + 1, 43, 1, 14, 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, + 1, 1, 1, 2, 1, 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, + 89, 3, 6, 2, 6, 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, + 29, 3, 49, 47, 32, 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, + 8, 52, 12, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, + 6, 1, 42, 1, 9, 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, + 10, 26, 6, 26, 38, 56, 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, + 27, 54, 10, 22, 10, 19, 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, + 42, 1, 2, 3, 2, 16, 6, 50, 3, 3, 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, + 10, 57, 9, 1, 13, 25, 23, 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, + 1, 34, 2, 1, 6, 4, 62, 7, 1, 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, + 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, 3, 1, 6, 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, + 1, 1, 4, 1, 2, 3, 1, 1, 1, 44, 66, 1, 
3, 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, + 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, 2, 1, 71, 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, + 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, 4, 93, 8, 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, + 72, 5, 1, 18, 73, 103, 8, 88, 33, 31, 9, 1, 45, 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, + 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, 6, 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 23, 44, 0, + 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, 102, 111, 17, 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, + 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, 48, 16, 4, 31, 21, 5, 19, 0, 45, 211, 64, 32, 25, + 2, 25, 44, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 7, 9, 0, 41, 32, 97, 115, 0, 4, 1, 7, 1, 2, + 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 4, 1, 0, 85, 1, 71, 1, + 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, + 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, + 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, 10, 7, 16, 1, 0, 30, 18, + 44, 0, 28, 228, 30, 2, 1, 207, 31, 1, 22, 8, 2, 224, 7, 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, + 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, + 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, + 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 2, 0, 2, 0, 15, 0, 0, 0, + 0, 0, 5, 0, 0, ]; #[inline] pub fn lookup(c: char) -> bool { @@ -126,65 +151,82 @@ pub mod alphabetic { } } -#[rustfmt::skip] pub mod case_ignorable { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 36] = [ - ShortOffsetRunHeader::new(0, 688), ShortOffsetRunHeader::new(11, 4957), - ShortOffsetRunHeader::new(263, 5906), ShortOffsetRunHeader::new(265, 8125), - ShortOffsetRunHeader::new(377, 11388), ShortOffsetRunHeader::new(411, 12293), - ShortOffsetRunHeader::new(423, 40981), ShortOffsetRunHeader::new(435, 42232), - 
ShortOffsetRunHeader::new(437, 42508), ShortOffsetRunHeader::new(439, 64286), - ShortOffsetRunHeader::new(535, 65024), ShortOffsetRunHeader::new(539, 66045), - ShortOffsetRunHeader::new(569, 67456), ShortOffsetRunHeader::new(575, 68097), - ShortOffsetRunHeader::new(581, 68900), ShortOffsetRunHeader::new(593, 69291), - ShortOffsetRunHeader::new(601, 71727), ShortOffsetRunHeader::new(727, 71995), - ShortOffsetRunHeader::new(731, 73459), ShortOffsetRunHeader::new(797, 78896), - ShortOffsetRunHeader::new(809, 90398), ShortOffsetRunHeader::new(813, 92912), - ShortOffsetRunHeader::new(817, 93504), ShortOffsetRunHeader::new(823, 94031), - ShortOffsetRunHeader::new(827, 110576), ShortOffsetRunHeader::new(837, 113821), - ShortOffsetRunHeader::new(843, 118528), ShortOffsetRunHeader::new(847, 119143), - ShortOffsetRunHeader::new(851, 121344), ShortOffsetRunHeader::new(861, 122880), - ShortOffsetRunHeader::new(873, 123566), ShortOffsetRunHeader::new(889, 124139), - ShortOffsetRunHeader::new(893, 125136), ShortOffsetRunHeader::new(907, 127995), - ShortOffsetRunHeader::new(911, 917505), ShortOffsetRunHeader::new(913, 2032112), + ShortOffsetRunHeader::new(0, 688), + ShortOffsetRunHeader::new(11, 4957), + ShortOffsetRunHeader::new(263, 5906), + ShortOffsetRunHeader::new(265, 8125), + ShortOffsetRunHeader::new(377, 11388), + ShortOffsetRunHeader::new(411, 12293), + ShortOffsetRunHeader::new(423, 40981), + ShortOffsetRunHeader::new(435, 42232), + ShortOffsetRunHeader::new(437, 42508), + ShortOffsetRunHeader::new(439, 64286), + ShortOffsetRunHeader::new(535, 65024), + ShortOffsetRunHeader::new(539, 66045), + ShortOffsetRunHeader::new(569, 67456), + ShortOffsetRunHeader::new(575, 68097), + ShortOffsetRunHeader::new(581, 68900), + ShortOffsetRunHeader::new(593, 69291), + ShortOffsetRunHeader::new(601, 71727), + ShortOffsetRunHeader::new(727, 71995), + ShortOffsetRunHeader::new(731, 73459), + ShortOffsetRunHeader::new(797, 78896), + ShortOffsetRunHeader::new(809, 90398), + 
ShortOffsetRunHeader::new(813, 92912), + ShortOffsetRunHeader::new(817, 93504), + ShortOffsetRunHeader::new(823, 94031), + ShortOffsetRunHeader::new(827, 110576), + ShortOffsetRunHeader::new(837, 113821), + ShortOffsetRunHeader::new(843, 118528), + ShortOffsetRunHeader::new(847, 119143), + ShortOffsetRunHeader::new(851, 121344), + ShortOffsetRunHeader::new(861, 122880), + ShortOffsetRunHeader::new(873, 123566), + ShortOffsetRunHeader::new(889, 124139), + ShortOffsetRunHeader::new(893, 125136), + ShortOffsetRunHeader::new(907, 127995), + ShortOffsetRunHeader::new(911, 917505), + ShortOffsetRunHeader::new(913, 2032112), ]; static OFFSETS: [u8; 919] = [ 168, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, 1, 1, 251, 7, 207, 1, 5, 1, 49, - 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, 1, 10, 21, 16, 1, 101, 8, 1, 10, - 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, 24, 43, 3, 44, 1, 7, 2, 5, 9, 41, - 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, - 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, - 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, - 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, - 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, - 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, - 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, - 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, - 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 46, 2, 12, 20, 4, 48, 1, 1, 5, 1, 1, 5, 1, - 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, - 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, 3, 11, 3, 13, 3, 13, 3, 13, 2, 12, - 5, 8, 2, 10, 1, 2, 1, 2, 5, 
49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, 0, 2, 113, 3, 125, 1, - 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, - 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 102, 4, 3, 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, - 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, - 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, - 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, 1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, - 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, - 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, 26, 5, 1, 1, 0, 2, 24, 1, 52, 6, 70, 11, - 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, - 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, - 2, 3, 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, 85, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, - 1, 4, 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, - 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, - 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 198, 1, 1, 3, 1, 1, 201, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, - 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 65, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, - 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, 3, 0, 5, 59, 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, 1, - 2, 13, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, - 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, - 0, 1, 61, 4, 0, 5, 254, 2, 243, 1, 2, 1, 7, 2, 5, 1, 9, 1, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, - 128, 240, 0, + 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, 1, 10, 21, 16, 1, 101, 8, 1, + 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, 24, 43, 3, 44, 1, 7, 2, 5, + 9, 41, 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 
7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, 20, 2, + 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, + 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, 1, + 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, + 4, 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 7, + 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, + 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 2, 30, 2, 30, 2, 64, + 2, 1, 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, + 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 46, 2, 12, 20, 4, 48, + 1, 1, 5, 1, 1, 5, 1, 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, + 64, 6, 82, 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, 3, 11, 3, 13, + 3, 13, 3, 13, 2, 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, + 0, 2, 113, 3, 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, + 6, 0, 1, 98, 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 102, 4, 3, 2, 8, 1, 3, 1, + 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, + 21, 2, 66, 6, 2, 2, 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, + 14, 2, 5, 2, 1, 1, 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, 1, 12, 16, + 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, + 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, 26, 5, 1, + 1, 0, 2, 24, 1, 52, 6, 70, 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, + 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, + 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, + 
85, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, + 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, + 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 198, 1, 1, 3, 1, + 1, 201, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, + 1, 1, 1, 65, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, 3, 0, 5, 59, + 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, 1, 2, 13, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, + 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, + 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, 1, 61, 4, 0, 5, 254, 2, 243, 1, 2, 1, 7, + 2, 5, 1, 9, 1, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; #[inline] pub fn lookup(c: char) -> bool { @@ -208,36 +250,46 @@ pub mod case_ignorable { } } -#[rustfmt::skip] pub mod cased { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [ - ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024), - ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958), - ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264), - ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824), - ShortOffsetRunHeader::new(177, 64256), ShortOffsetRunHeader::new(183, 65313), - ShortOffsetRunHeader::new(187, 66560), ShortOffsetRunHeader::new(191, 67456), - ShortOffsetRunHeader::new(213, 68736), ShortOffsetRunHeader::new(221, 71840), - ShortOffsetRunHeader::new(229, 93760), ShortOffsetRunHeader::new(231, 119808), - ShortOffsetRunHeader::new(237, 120486), ShortOffsetRunHeader::new(274, 122624), - ShortOffsetRunHeader::new(297, 122928), ShortOffsetRunHeader::new(303, 125184), - ShortOffsetRunHeader::new(305, 127280), ShortOffsetRunHeader::new(307, 1241482), + ShortOffsetRunHeader::new(0, 4256), + ShortOffsetRunHeader::new(51, 5024), + 
ShortOffsetRunHeader::new(61, 7296), + ShortOffsetRunHeader::new(65, 7958), + ShortOffsetRunHeader::new(74, 9398), + ShortOffsetRunHeader::new(149, 11264), + ShortOffsetRunHeader::new(151, 42560), + ShortOffsetRunHeader::new(163, 43824), + ShortOffsetRunHeader::new(177, 64256), + ShortOffsetRunHeader::new(183, 65313), + ShortOffsetRunHeader::new(187, 66560), + ShortOffsetRunHeader::new(191, 67456), + ShortOffsetRunHeader::new(213, 68736), + ShortOffsetRunHeader::new(221, 71840), + ShortOffsetRunHeader::new(229, 93760), + ShortOffsetRunHeader::new(231, 119808), + ShortOffsetRunHeader::new(237, 120486), + ShortOffsetRunHeader::new(274, 122624), + ShortOffsetRunHeader::new(297, 122928), + ShortOffsetRunHeader::new(303, 125184), + ShortOffsetRunHeader::new(305, 127280), + ShortOffsetRunHeader::new(307, 1241482), ]; static OFFSETS: [u8; 313] = [ 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4, - 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1, - 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8, - 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, - 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4, - 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132, - 102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, - 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, - 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, - 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, - 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0, - 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, + 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 
9, 41, 0, 38, 1, + 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, + 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, + 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, + 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, + 18, 30, 132, 102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26, + 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1, + 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2, + 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, + 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, + 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, ]; #[inline] pub fn lookup(c: char) -> bool { @@ -261,58 +313,73 @@ pub mod cased { } } -#[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 33] = [ - ShortOffsetRunHeader::new(0, 768), ShortOffsetRunHeader::new(1, 1155), - ShortOffsetRunHeader::new(3, 1425), ShortOffsetRunHeader::new(5, 4957), - ShortOffsetRunHeader::new(249, 5906), ShortOffsetRunHeader::new(251, 8204), - ShortOffsetRunHeader::new(347, 11503), ShortOffsetRunHeader::new(351, 12330), - ShortOffsetRunHeader::new(357, 42607), ShortOffsetRunHeader::new(361, 43010), - ShortOffsetRunHeader::new(369, 64286), ShortOffsetRunHeader::new(435, 65024), - ShortOffsetRunHeader::new(437, 65438), ShortOffsetRunHeader::new(441, 66045), - ShortOffsetRunHeader::new(443, 68097), ShortOffsetRunHeader::new(449, 68900), - ShortOffsetRunHeader::new(461, 69291), ShortOffsetRunHeader::new(465, 71727), - ShortOffsetRunHeader::new(601, 73459), ShortOffsetRunHeader::new(669, 78912), - ShortOffsetRunHeader::new(679, 90398), ShortOffsetRunHeader::new(683, 92912), - 
ShortOffsetRunHeader::new(687, 94031), ShortOffsetRunHeader::new(691, 113821), - ShortOffsetRunHeader::new(699, 118528), ShortOffsetRunHeader::new(701, 119141), - ShortOffsetRunHeader::new(705, 121344), ShortOffsetRunHeader::new(717, 122880), - ShortOffsetRunHeader::new(729, 123566), ShortOffsetRunHeader::new(743, 124140), - ShortOffsetRunHeader::new(747, 125136), ShortOffsetRunHeader::new(759, 917536), + ShortOffsetRunHeader::new(0, 768), + ShortOffsetRunHeader::new(1, 1155), + ShortOffsetRunHeader::new(3, 1425), + ShortOffsetRunHeader::new(5, 4957), + ShortOffsetRunHeader::new(249, 5906), + ShortOffsetRunHeader::new(251, 8204), + ShortOffsetRunHeader::new(347, 11503), + ShortOffsetRunHeader::new(351, 12330), + ShortOffsetRunHeader::new(357, 42607), + ShortOffsetRunHeader::new(361, 43010), + ShortOffsetRunHeader::new(369, 64286), + ShortOffsetRunHeader::new(435, 65024), + ShortOffsetRunHeader::new(437, 65438), + ShortOffsetRunHeader::new(441, 66045), + ShortOffsetRunHeader::new(443, 68097), + ShortOffsetRunHeader::new(449, 68900), + ShortOffsetRunHeader::new(461, 69291), + ShortOffsetRunHeader::new(465, 71727), + ShortOffsetRunHeader::new(601, 73459), + ShortOffsetRunHeader::new(669, 78912), + ShortOffsetRunHeader::new(679, 90398), + ShortOffsetRunHeader::new(683, 92912), + ShortOffsetRunHeader::new(687, 94031), + ShortOffsetRunHeader::new(691, 113821), + ShortOffsetRunHeader::new(699, 118528), + ShortOffsetRunHeader::new(701, 119141), + ShortOffsetRunHeader::new(705, 121344), + ShortOffsetRunHeader::new(717, 122880), + ShortOffsetRunHeader::new(729, 123566), + ShortOffsetRunHeader::new(743, 124140), + ShortOffsetRunHeader::new(747, 125136), + ShortOffsetRunHeader::new(759, 917536), ShortOffsetRunHeader::new(763, 2032112), ]; static OFFSETS: [u8; 767] = [ 0, 112, 0, 7, 0, 45, 1, 1, 1, 2, 1, 2, 1, 1, 72, 11, 48, 21, 16, 1, 101, 7, 2, 6, 2, 2, 1, - 4, 35, 1, 30, 27, 91, 11, 58, 9, 9, 1, 24, 4, 1, 9, 1, 3, 1, 5, 43, 3, 59, 9, 42, 24, 1, 32, - 55, 1, 1, 1, 4, 8, 4, 1, 
3, 7, 10, 2, 29, 1, 58, 1, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 26, 1, 2, - 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, - 1, 1, 58, 1, 1, 2, 1, 4, 8, 1, 7, 3, 10, 2, 30, 1, 59, 1, 1, 1, 12, 1, 9, 1, 40, 1, 3, 1, - 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 2, 1, 1, 3, 3, 1, 4, 7, 2, 11, 2, 28, - 2, 57, 2, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 29, 1, 72, 1, 4, 1, 2, 3, 1, 1, 8, 1, 81, 1, 2, 7, - 12, 8, 98, 1, 2, 9, 11, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, - 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 0, 3, 0, 4, 28, 3, - 29, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 9, 1, 45, 3, 1, 1, 117, 2, 34, 1, 118, 3, 4, 2, 9, - 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 48, 46, 2, 12, 20, 4, 48, - 10, 4, 3, 38, 9, 12, 2, 32, 4, 2, 6, 56, 1, 1, 2, 3, 1, 1, 5, 56, 8, 2, 2, 152, 3, 1, 13, 1, - 7, 4, 1, 6, 1, 3, 2, 198, 64, 0, 1, 195, 33, 0, 3, 141, 1, 96, 32, 0, 6, 105, 2, 0, 4, 1, - 10, 32, 2, 80, 2, 0, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 1, 1, - 44, 3, 48, 1, 2, 4, 2, 2, 2, 1, 36, 1, 67, 6, 2, 2, 2, 2, 12, 1, 8, 1, 47, 1, 51, 1, 1, 3, - 2, 2, 5, 2, 1, 1, 42, 2, 8, 1, 238, 1, 2, 1, 4, 1, 0, 1, 0, 16, 16, 16, 0, 2, 0, 1, 226, 1, - 149, 5, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 65, 5, 0, 2, 77, 6, 70, 11, 49, 4, 123, - 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 7, 1, 61, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, - 1, 1, 8, 4, 2, 1, 95, 3, 2, 4, 6, 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 1, 1, 1, 1, 12, 1, 9, - 1, 14, 7, 3, 5, 67, 1, 2, 6, 1, 1, 2, 1, 1, 3, 4, 3, 1, 1, 14, 2, 85, 8, 2, 3, 1, 1, 23, 1, - 81, 1, 2, 6, 1, 1, 2, 1, 1, 2, 1, 2, 235, 1, 2, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, - 106, 1, 1, 1, 2, 8, 101, 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 245, 1, 10, 4, 4, 1, 144, 4, 2, 2, - 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 198, 1, 1, 3, 1, 1, 201, 7, 1, 6, - 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 
2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 11, - 2, 52, 5, 5, 3, 23, 1, 0, 1, 6, 15, 0, 12, 3, 3, 0, 5, 59, 7, 0, 1, 63, 4, 81, 1, 11, 2, 0, - 2, 0, 46, 2, 23, 0, 5, 3, 6, 8, 8, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, - 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 254, 2, 243, 1, 2, 1, - 7, 2, 5, 1, 0, 7, 109, 7, 0, 96, 128, 240, 0, + 4, 35, 1, 30, 27, 91, 11, 58, 9, 9, 1, 24, 4, 1, 9, 1, 3, 1, 5, 43, 3, 59, 9, 42, 24, 1, + 32, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 29, 1, 58, 1, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 26, + 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, + 22, 6, 1, 1, 58, 1, 1, 2, 1, 4, 8, 1, 7, 3, 10, 2, 30, 1, 59, 1, 1, 1, 12, 1, 9, 1, 40, 1, + 3, 1, 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 2, 1, 1, 3, 3, 1, 4, 7, 2, 11, + 2, 28, 2, 57, 2, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 29, 1, 72, 1, 4, 1, 2, 3, 1, 1, 8, 1, 81, + 1, 2, 7, 12, 8, 98, 1, 2, 9, 11, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, + 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 0, 3, 0, + 4, 28, 3, 29, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 9, 1, 45, 3, 1, 1, 117, 2, 34, 1, 118, 3, + 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 48, 46, 2, 12, + 20, 4, 48, 10, 4, 3, 38, 9, 12, 2, 32, 4, 2, 6, 56, 1, 1, 2, 3, 1, 1, 5, 56, 8, 2, 2, 152, + 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 198, 64, 0, 1, 195, 33, 0, 3, 141, 1, 96, 32, 0, 6, 105, + 2, 0, 4, 1, 10, 32, 2, 80, 2, 0, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, + 25, 11, 1, 1, 44, 3, 48, 1, 2, 4, 2, 2, 2, 1, 36, 1, 67, 6, 2, 2, 2, 2, 12, 1, 8, 1, 47, 1, + 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 42, 2, 8, 1, 238, 1, 2, 1, 4, 1, 0, 1, 0, 16, 16, 16, 0, 2, + 0, 1, 226, 1, 149, 5, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 65, 5, 0, 2, 77, 6, 70, + 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 7, 1, 61, 3, 36, 5, 1, 8, 62, + 1, 12, 2, 52, 9, 1, 1, 8, 4, 2, 1, 95, 
3, 2, 4, 6, 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 1, + 1, 1, 1, 12, 1, 9, 1, 14, 7, 3, 5, 67, 1, 2, 6, 1, 1, 2, 1, 1, 3, 4, 3, 1, 1, 14, 2, 85, 8, + 2, 3, 1, 1, 23, 1, 81, 1, 2, 6, 1, 1, 2, 1, 1, 2, 1, 2, 235, 1, 2, 4, 6, 2, 1, 2, 27, 2, + 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 8, 101, 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 245, 1, 10, 4, + 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 198, 1, 1, + 3, 1, 1, 201, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, + 2, 3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 3, 23, 1, 0, 1, 6, 15, 0, 12, 3, 3, 0, 5, 59, 7, 0, + 1, 63, 4, 81, 1, 11, 2, 0, 2, 0, 46, 2, 23, 0, 5, 3, 6, 8, 8, 2, 7, 30, 4, 148, 3, 0, 55, + 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, + 4, 0, 4, 254, 2, 243, 1, 2, 1, 7, 2, 5, 1, 0, 7, 109, 7, 0, 96, 128, 240, 0, ]; #[inline] pub fn lookup(c: char) -> bool { @@ -336,14 +403,13 @@ pub mod grapheme_extend { } } -#[rustfmt::skip] pub mod lowercase { static BITSET_CHUNKS_MAP: [u8; 123] = [ 12, 17, 0, 0, 9, 0, 0, 13, 14, 10, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 4, 1, 0, 15, 0, 8, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, - 3, 18, 0, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 4, 1, 0, 15, 0, 8, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, + 0, 3, 18, 0, 7, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 20] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], @@ -427,65 +493,105 @@ pub mod lowercase { 0b1110101111000000000000000000000000001111111111111111111111111100, ]; static BITSET_MAPPING: [(u8, u8); 22] = [ - (0, 64), 
(1, 184), (1, 182), (1, 179), (1, 172), (1, 168), (1, 161), (1, 146), (1, 144), - (1, 140), (1, 136), (1, 132), (2, 146), (2, 144), (2, 83), (3, 93), (3, 147), (3, 133), - (4, 12), (4, 6), (5, 187), (6, 78), + (0, 64), + (1, 184), + (1, 182), + (1, 179), + (1, 172), + (1, 168), + (1, 161), + (1, 146), + (1, 144), + (1, 140), + (1, 136), + (1, 132), + (2, 146), + (2, 144), + (2, 83), + (3, 93), + (3, 147), + (3, 133), + (4, 12), + (4, 6), + (5, 187), + (6, 78), ]; pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); - (c as u32) >= 0xaa && - super::bitset_search( - c as u32, - &BITSET_CHUNKS_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, - ) + (c as u32) >= 0xaa + && super::bitset_search( + c as u32, + &BITSET_CHUNKS_MAP, + &BITSET_INDEX_CHUNKS, + &BITSET_CANONICAL, + &BITSET_MAPPING, + ) } } -#[rustfmt::skip] pub mod n { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 43] = [ - ShortOffsetRunHeader::new(0, 1632), ShortOffsetRunHeader::new(7, 2406), - ShortOffsetRunHeader::new(13, 4160), ShortOffsetRunHeader::new(47, 4969), - ShortOffsetRunHeader::new(51, 5870), ShortOffsetRunHeader::new(53, 6470), - ShortOffsetRunHeader::new(61, 8304), ShortOffsetRunHeader::new(77, 9312), - ShortOffsetRunHeader::new(87, 10102), ShortOffsetRunHeader::new(91, 11517), - ShortOffsetRunHeader::new(93, 12295), ShortOffsetRunHeader::new(95, 12690), - ShortOffsetRunHeader::new(101, 42528), ShortOffsetRunHeader::new(113, 43056), - ShortOffsetRunHeader::new(117, 44016), ShortOffsetRunHeader::new(129, 65296), - ShortOffsetRunHeader::new(131, 65799), ShortOffsetRunHeader::new(133, 66273), - ShortOffsetRunHeader::new(139, 67672), ShortOffsetRunHeader::new(151, 68858), - ShortOffsetRunHeader::new(181, 69216), ShortOffsetRunHeader::new(187, 70736), - ShortOffsetRunHeader::new(207, 71248), ShortOffsetRunHeader::new(211, 71904), - ShortOffsetRunHeader::new(219, 72688), ShortOffsetRunHeader::new(223, 73552), - 
ShortOffsetRunHeader::new(233, 74752), ShortOffsetRunHeader::new(237, 90416), - ShortOffsetRunHeader::new(239, 92768), ShortOffsetRunHeader::new(241, 93552), - ShortOffsetRunHeader::new(249, 93824), ShortOffsetRunHeader::new(251, 94196), - ShortOffsetRunHeader::new(253, 118000), ShortOffsetRunHeader::new(255, 119488), - ShortOffsetRunHeader::new(257, 120782), ShortOffsetRunHeader::new(263, 123200), - ShortOffsetRunHeader::new(265, 123632), ShortOffsetRunHeader::new(267, 124144), - ShortOffsetRunHeader::new(269, 125127), ShortOffsetRunHeader::new(273, 126065), - ShortOffsetRunHeader::new(277, 127232), ShortOffsetRunHeader::new(287, 130032), + ShortOffsetRunHeader::new(0, 1632), + ShortOffsetRunHeader::new(7, 2406), + ShortOffsetRunHeader::new(13, 4160), + ShortOffsetRunHeader::new(47, 4969), + ShortOffsetRunHeader::new(51, 5870), + ShortOffsetRunHeader::new(53, 6470), + ShortOffsetRunHeader::new(61, 8304), + ShortOffsetRunHeader::new(77, 9312), + ShortOffsetRunHeader::new(87, 10102), + ShortOffsetRunHeader::new(91, 11517), + ShortOffsetRunHeader::new(93, 12295), + ShortOffsetRunHeader::new(95, 12690), + ShortOffsetRunHeader::new(101, 42528), + ShortOffsetRunHeader::new(113, 43056), + ShortOffsetRunHeader::new(117, 44016), + ShortOffsetRunHeader::new(129, 65296), + ShortOffsetRunHeader::new(131, 65799), + ShortOffsetRunHeader::new(133, 66273), + ShortOffsetRunHeader::new(139, 67672), + ShortOffsetRunHeader::new(151, 68858), + ShortOffsetRunHeader::new(181, 69216), + ShortOffsetRunHeader::new(187, 70736), + ShortOffsetRunHeader::new(207, 71248), + ShortOffsetRunHeader::new(211, 71904), + ShortOffsetRunHeader::new(219, 72688), + ShortOffsetRunHeader::new(223, 73552), + ShortOffsetRunHeader::new(233, 74752), + ShortOffsetRunHeader::new(237, 90416), + ShortOffsetRunHeader::new(239, 92768), + ShortOffsetRunHeader::new(241, 93552), + ShortOffsetRunHeader::new(249, 93824), + ShortOffsetRunHeader::new(251, 94196), + ShortOffsetRunHeader::new(253, 118000), + 
ShortOffsetRunHeader::new(255, 119488), + ShortOffsetRunHeader::new(257, 120782), + ShortOffsetRunHeader::new(263, 123200), + ShortOffsetRunHeader::new(265, 123632), + ShortOffsetRunHeader::new(267, 124144), + ShortOffsetRunHeader::new(269, 125127), + ShortOffsetRunHeader::new(273, 126065), + ShortOffsetRunHeader::new(277, 127232), + ShortOffsetRunHeader::new(287, 130032), ShortOffsetRunHeader::new(289, 1244154), ]; static OFFSETS: [u8; 291] = [ 178, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, 10, 118, 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, 70, 20, - 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, 182, 10, - 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, 1, 0, 1, - 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, 154, 10, - 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, 29, 1, 8, - 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, 52, 2, - 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, 7, 134, - 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, 20, 76, 12, - 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 54, 10, 0, 10, 102, 21, 0, 111, 0, 10, 0, 10, - 86, 10, 134, 10, 1, 7, 0, 10, 0, 23, 0, 3, 0, 10, 0, 20, 12, 20, 108, 25, 0, 50, 0, 10, 0, - 10, 0, 10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, + 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, 182, + 10, 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, 1, + 0, 1, 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, 154, + 10, 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, 29, + 1, 8, 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 
160, 2, 2, 16, 2, 46, 64, 9, + 52, 2, 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, + 7, 134, 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, + 20, 76, 12, 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 54, 10, 0, 10, 102, 21, 0, 111, + 0, 10, 0, 10, 86, 10, 134, 10, 1, 7, 0, 10, 0, 23, 0, 3, 0, 10, 0, 20, 12, 20, 108, 25, 0, + 50, 0, 10, 0, 10, 0, 10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, + 0, 10, 0, ]; #[inline] pub fn lookup(c: char) -> bool { @@ -509,14 +615,13 @@ pub mod n { } } -#[rustfmt::skip] pub mod uppercase { static BITSET_CHUNKS_MAP: [u8; 125] = [ 3, 14, 6, 6, 0, 6, 6, 2, 5, 12, 6, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 7, 6, 13, 6, 11, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 16, 6, 6, - 6, 6, 10, 6, 4, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 7, 6, 13, 6, 11, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 16, 6, + 6, 6, 6, 10, 6, 4, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ [44, 44, 5, 35, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 5, 0], @@ -584,36 +689,57 @@ pub mod uppercase { 0b1111111100000000111111110000000000111111000000001111111100000000, ]; static BITSET_MAPPING: [(u8, u8); 25] = [ - (0, 182), (0, 74), (0, 166), (0, 162), (0, 159), (0, 150), (0, 148), (0, 142), (0, 134), - (0, 131), (0, 64), (1, 66), (1, 70), (1, 83), (1, 12), (1, 8), (2, 146), (2, 140), (2, 134), - (2, 130), (3, 164), (3, 146), (3, 20), (4, 178), (4, 171), + (0, 182), + (0, 74), + (0, 166), + (0, 162), + (0, 159), + (0, 150), + (0, 148), + (0, 142), + (0, 134), + (0, 131), + (0, 
64), + (1, 66), + (1, 70), + (1, 83), + (1, 12), + (1, 8), + (2, 146), + (2, 140), + (2, 134), + (2, 130), + (3, 164), + (3, 146), + (3, 20), + (4, 178), + (4, 171), ]; pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); - (c as u32) >= 0xc0 && - super::bitset_search( - c as u32, - &BITSET_CHUNKS_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, - ) + (c as u32) >= 0xc0 + && super::bitset_search( + c as u32, + &BITSET_CHUNKS_MAP, + &BITSET_INDEX_CHUNKS, + &BITSET_CANONICAL, + &BITSET_MAPPING, + ) } } -#[rustfmt::skip] pub mod white_space { static WHITESPACE_MAP: [u8; 256] = [ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; #[inline] pub const fn lookup(c: char) -> bool { @@ -628,7 +754,6 @@ pub mod white_space { } } -#[rustfmt::skip] pub mod conversions { const INDEX_MASK: u32 = 0x400000; @@ -642,7 +767,9 @@ pub mod conversions { let u = LOWERCASE_TABLE[i].1; char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { // SAFETY: Index comes from statically generated table - unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + unsafe { + *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) + } }) }) .unwrap_or([c, '\0', '\0']) @@ -659,13 +786,16 @@ pub mod conversions { let u = UPPERCASE_TABLE[i].1; char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { // SAFETY: Index comes from statically generated table - unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + unsafe { + *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) + } }) }) .unwrap_or([c, '\0', '\0']) } } + #[rustfmt::skip] static LOWERCASE_TABLE: &[(char, u32); 1462] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), @@ -1023,10 +1153,12 @@ pub mod conversions { ('\u{1e921}', 125251), ]; + #[rustfmt::skip] static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], ]; + #[rustfmt::skip] static UPPERCASE_TABLE: &[(char, u32); 1554] = &[ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), @@ -1407,6 +1539,7 @@ pub mod conversions { ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; + #[rustfmt::skip] static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', 
'\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 49aef3ec33ec7..2c5918b19188b 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -51,6 +51,7 @@ fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize let mut size = 0; size += size_of_val(mappings.as_slice()); + writeln!(tables, "#[rustfmt::skip]").unwrap(); write!( tables, "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];", @@ -63,6 +64,7 @@ fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize tables.push_str("\n\n"); size += size_of_val(multis.as_slice()); + writeln!(tables, "#[rustfmt::skip]").unwrap(); write!( tables, "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 32118fc75281b..418639c562862 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -272,7 +272,6 @@ fn main() { modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { - table_file.push_str("#[rustfmt::skip]\n"); table_file.push_str(&format!("pub mod {name} {{\n")); for line in contents.lines() { if !line.trim().is_empty() { @@ -285,6 +284,11 @@ fn main() { } std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); + rustfmt(&write_location); +} + +fn rustfmt(path: &str) { + std::process::Command::new("rustfmt").arg(path).status().expect("rustfmt failed"); } fn version() -> String { From 33d07c7015486445d7c3ec711a28df5a2b690d52 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 11 Oct 2025 22:50:52 +0100 Subject: [PATCH 3/4] refactor: remove check that `first_code_point` is non-ascii This check was made redundant (it will always be true) when we removed all ASCII characters 
from the tables (https://github.com/rust-lang/rust/pull/146173/commits/a8c669461f0c71985c72dd5b05f70b8d4d149e3b). --- .../unicode-table-generator/src/skiplist.rs | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 660a8f342f7a7..9b38fd4864f10 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -88,28 +88,21 @@ impl RawEmitter { // The inlining in this code works like the following: // - // The `skip_search` function is always inlined into the parent `lookup` fn, + // The `skip_search` function is always inlined into the parent `lookup_slow` fn, // thus the compiler can generate optimal code based on the referenced `static`s. // - // In the case of ASCII optimization, the lower-bounds check is inlined into - // the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn. - // - // Thus, in both cases, the `skip_search` function is specialized for the `static`s, - // and outlined into the prebuilt `std`. - if first_code_point > 0x7f { - writeln!(&mut self.file, "#[inline]").unwrap(); - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") - .unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - writeln!(&mut self.file).unwrap(); - writeln!(&mut self.file, "#[inline(never)]").unwrap(); - writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); - } else { - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - } + // The lower-bounds check is inlined into the caller, and slower-path + // `skip_search` is outlined into a separate `lookup_slow` fn. 
+ assert!(first_code_point > 0x7f); + writeln!(&mut self.file, "#[inline]").unwrap(); + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); + writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") + .unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + writeln!(&mut self.file).unwrap(); + writeln!(&mut self.file, "#[inline(never)]").unwrap(); + writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); writeln!(&mut self.file, " const {{").unwrap(); writeln!( &mut self.file, From 1a646cf2e20f4575aab3e144d50d0a2042a01d68 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 11 Oct 2025 02:03:44 +0100 Subject: [PATCH 4/4] refactor: make string formatting more readable To make the final output code easier to see: * Get rid of the unnecessary line-noise of `.unwrap()`ing calls to `write!()` by moving the `.unwrap()` into a macro. * Join consecutive `write!()` calls using a single multiline format string. * Replace `.push()` and `.push_str(format!())` with `write!()`. * If after doing all of the above, there is only a single `write!()` call in the function, just construct the string directly with `format!()`. 
--- library/core/src/unicode/unicode_data.rs | 11 +- .../src/cascading_map.rs | 39 +++-- .../src/case_mapping.rs | 67 +++----- .../src/fmt_helpers.rs | 66 ++++++++ src/tools/unicode-table-generator/src/main.rs | 147 +++++++----------- .../src/raw_emitter.rs | 96 ++++-------- .../unicode-table-generator/src/skiplist.rs | 89 ++++------- 7 files changed, 232 insertions(+), 283 deletions(-) create mode 100644 src/tools/unicode-table-generator/src/fmt_helpers.rs diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index dd00c50c6966a..bda8d9d30afce 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -12,6 +12,7 @@ // Total : 31911 bytes pub const UNICODE_VERSION: (u8, u8, u8) = (17, 0, 0); + use super::rt::*; pub mod alphabetic { @@ -129,6 +130,7 @@ pub mod alphabetic { 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 2, 0, 2, 0, 15, 0, 0, 0, 0, 0, 5, 0, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -228,6 +230,7 @@ pub mod case_ignorable { 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, 1, 61, 4, 0, 5, 254, 2, 243, 1, 2, 1, 7, 2, 5, 1, 9, 1, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -291,6 +294,7 @@ pub mod cased { 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -381,6 +385,7 @@ pub mod grapheme_extend { 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 254, 2, 243, 1, 2, 1, 7, 2, 5, 1, 0, 7, 109, 7, 0, 96, 128, 240, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -593,6 +598,7 @@ pub mod n { 50, 0, 10, 0, 10, 0, 10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 
0, 13, 0, 10, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -741,6 +747,7 @@ pub mod white_space { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; + #[inline] pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); @@ -755,7 +762,7 @@ pub mod white_space { } pub mod conversions { - const INDEX_MASK: u32 = 0x400000; + const INDEX_MASK: u32 = 1 << 22; pub fn to_lower(c: char) -> [char; 3] { if c.is_ascii() { @@ -1152,7 +1159,6 @@ pub mod conversions { ('\u{1e91d}', 125247), ('\u{1e91e}', 125248), ('\u{1e91f}', 125249), ('\u{1e920}', 125250), ('\u{1e921}', 125251), ]; - #[rustfmt::skip] static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], @@ -1538,7 +1544,6 @@ pub mod conversions { ('\u{1e93d}', 125211), ('\u{1e93e}', 125212), ('\u{1e93f}', 125213), ('\u{1e940}', 125214), ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - #[rustfmt::skip] static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index 56e6401908dcf..6ad8b12bc7437 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -1,9 +1,8 @@ use std::collections::HashMap; -use std::fmt::Write as _; use std::ops::Range; -use crate::fmt_list; use crate::raw_emitter::RawEmitter; +use crate::writeln; impl RawEmitter { pub fn emit_cascading_map(&mut self, ranges: &[Range]) -> bool { @@ -24,8 +23,6 @@ impl RawEmitter { .flat_map(|r| (r.start..r.end).collect::>()) .collect::>(); - println!("there are {} points", points.len()); - // how many distinct ranges need to be counted? 
let mut codepoints_by_high_bytes = HashMap::>::new(); for point in points { @@ -37,7 +34,7 @@ impl RawEmitter { } let mut bit_for_high_byte = 1u8; - let mut arms = Vec::::new(); + let mut arms = String::new(); let mut high_bytes: Vec = codepoints_by_high_bytes.keys().copied().collect(); high_bytes.sort(); @@ -45,33 +42,33 @@ impl RawEmitter { let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); if codepoints.len() == 1 { let ch = codepoints.pop().unwrap(); - arms.push(format!("{high_byte} => c as u32 == {ch:#04x}")); + writeln!(arms, "{high_byte} => c as u32 == {ch:#04x},"); continue; } // more than 1 codepoint in this arm for codepoint in codepoints { map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; } - arms.push(format!( - "{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0" - )); + writeln!( + arms, + "{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0," + ); bit_for_high_byte <<= 1; } - writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) - .unwrap(); self.bytes_used += 256; + self.file = format!( + "static WHITESPACE_MAP: [u8; 256] = {map:?}; - writeln!(&mut self.file, "#[inline]").unwrap(); - writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); - for arm in arms { - writeln!(&mut self.file, " {arm},").unwrap(); - } - writeln!(&mut self.file, " _ => false,").unwrap(); - writeln!(&mut self.file, " }}").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); + #[inline] + pub const fn lookup(c: char) -> bool {{ + debug_assert!(!c.is_ascii()); + match c as u32 >> 8 {{ + {arms}\ + _ => false, + }} + }}" + ); true } diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 2c5918b19188b..d634b58b6d4d7 100644 --- 
a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -1,23 +1,18 @@ use std::char; use std::collections::BTreeMap; -use std::fmt::{self, Write}; -use crate::{UnicodeData, fmt_list}; +use crate::{CharEscape, UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { - let mut file = String::new(); - - write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); - file.push_str("\n\n"); - file.push_str(HEADER.trim_start()); - file.push('\n'); let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower); - file.push_str(&lower_tables); - file.push_str("\n\n"); let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); - file.push_str(&upper_tables); + let file = format!( + "{HEADER} + {lower_tables} + {upper_tables}" + ); (file, [lower_size, upper_size]) } @@ -47,45 +42,23 @@ fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize mappings.push((CharEscape(key), value)); } - let mut tables = String::new(); - let mut size = 0; - - size += size_of_val(mappings.as_slice()); - writeln!(tables, "#[rustfmt::skip]").unwrap(); - write!( - tables, - "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];", - case, - mappings.len(), - fmt_list(mappings), - ) - .unwrap(); - - tables.push_str("\n\n"); - - size += size_of_val(multis.as_slice()); - writeln!(tables, "#[rustfmt::skip]").unwrap(); - write!( - tables, - "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", - case, - multis.len(), - fmt_list(multis), - ) - .unwrap(); - - (tables, size) -} - -struct CharEscape(char); - -impl fmt::Debug for CharEscape { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "'{}'", self.0.escape_default()) - } + let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice()); + let file = format!( + " + #[rustfmt::skip]\nstatic {case}CASE_TABLE: &[(char, u32); 
{mappings_len}] = &[{mappings}]; + #[rustfmt::skip]\nstatic {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];", + mappings = fmt_list(&mappings), + mappings_len = mappings.len(), + multis = fmt_list(&multis), + multis_len = multis.len(), + ); + + (file, size) } static HEADER: &str = r" +const INDEX_MASK: u32 = 1 << 22; + pub fn to_lower(c: char) -> [char; 3] { if c.is_ascii() { [(c as u8).to_ascii_lowercase() as char, '\0', '\0'] diff --git a/src/tools/unicode-table-generator/src/fmt_helpers.rs b/src/tools/unicode-table-generator/src/fmt_helpers.rs new file mode 100644 index 0000000000000..68fcbb5c53909 --- /dev/null +++ b/src/tools/unicode-table-generator/src/fmt_helpers.rs @@ -0,0 +1,66 @@ +use std::fmt; + +// Convenience macros for writing and unwrapping. +#[macro_export] +macro_rules! writeln { + ($($args:tt)*) => {{ + use std::fmt::Write as _; + std::writeln!($($args)*).unwrap(); + }}; +} +#[macro_export] +macro_rules! write { + ($($args:tt)*) => {{ + use std::fmt::Write as _; + std::write!($($args)*).unwrap(); + }}; +} + +pub fn fmt_list(values: impl IntoIterator) -> String { + let pieces = values.into_iter().map(|b| format!("{b:?}, ")); + let mut out = String::new(); + let mut line = String::from("\n "); + for piece in pieces { + if line.len() + piece.len() < 98 { + line.push_str(&piece); + } else { + writeln!(out, "{}", line.trim_end()); + line = format!(" {piece}"); + } + } + writeln!(out, "{}", line.trim_end()); + out +} + +/// Wrapper type for formatting a `T` using its `Binary` implementation. +#[derive(Copy, Clone)] +pub struct Bin(pub T); + +impl fmt::Debug for Bin { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let bits = size_of::() * 8; + std::write!(f, "0b{:0bits$b}", self.0) + } +} + +impl fmt::Display for Bin { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// Wrapper type for formatting a `char` using `escape_default`. 
+#[derive(Copy, Clone)] +pub struct CharEscape(pub char); + +impl fmt::Debug for CharEscape { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + std::write!(f, "'{}'", self.0.escape_default()) + } +} + +impl fmt::Display for CharEscape { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 418639c562862..d30a461dbe8b6 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -72,18 +72,18 @@ //! or not. use std::collections::{BTreeMap, HashMap}; -use std::fmt; -use std::fmt::Write; use std::ops::Range; use ucd_parse::Codepoints; mod cascading_map; mod case_mapping; +mod fmt_helpers; mod raw_emitter; mod skiplist; mod unicode_download; +pub use fmt_helpers::*; use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; static PROPERTIES: &[&str] = &[ @@ -224,12 +224,13 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); + std::fs::write(&path, generate_tests(&unicode_data)).unwrap(); } let mut table_file = String::new(); - table_file.push_str( - "//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", + writeln!( + table_file, + "//! 
This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!", ); let mut total_bytes = 0; @@ -245,8 +246,9 @@ fn main() { } modules.push((property.to_lowercase().to_string(), emitter.file)); - table_file.push_str(&format!( - "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", + writeln!( + table_file, + "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}", property, emitter.bytes_used, datapoints, @@ -254,36 +256,30 @@ fn main() { ranges.first().unwrap().start, ranges.last().unwrap().end, emitter.desc, - )); + ); total_bytes += emitter.bytes_used; } let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { - table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); + writeln!(table_file, "// {:16}: {:5} bytes", name, size); total_bytes += size; } - table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); + writeln!(table_file, "// {:16}: {:5} bytes\n", "Total", total_bytes); - table_file.push('\n'); - table_file.push_str(&version()); - table_file.push_str("use super::rt::*;\n"); - table_file.push('\n'); + writeln!(table_file, "{}\n", version()); + writeln!(table_file, "use super::rt::*;\n"); modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { - table_file.push_str(&format!("pub mod {name} {{\n")); - for line in contents.lines() { - if !line.trim().is_empty() { - table_file.push_str(" "); - table_file.push_str(line); - } - table_file.push('\n'); + writeln!(table_file, "pub mod {name} {{"); + for line in contents.trim().lines() { + writeln!(table_file, " {line}"); } - table_file.push_str("}\n\n"); + writeln!(table_file, "}}\n"); } - std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); + std::fs::write(&write_location, table_file).unwrap(); rustfmt(&write_location); } @@ -292,9 
+288,6 @@ fn rustfmt(path: &str) { } fn version() -> String { - let mut out = String::new(); - out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = "); - let readme = std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt")) .unwrap(); @@ -306,66 +299,44 @@ fn version() -> String { readme[start..end].split('.').map(|v| v.parse::().expect(v)).collect::>(); let [major, minor, micro] = [version[0], version[1], version[2]]; - out.push_str(&format!("({major}, {minor}, {micro});\n")); - out -} - -fn fmt_list(values: impl IntoIterator) -> String { - let pieces = values.into_iter().map(|b| format!("{b:?}, ")).collect::>(); - let mut out = String::new(); - let mut line = String::from("\n "); - for piece in pieces { - if line.len() + piece.len() < 98 { - line.push_str(&piece); - } else { - out.push_str(line.trim_end()); - out.push('\n'); - line = format!(" {piece}"); - } - } - out.push_str(line.trim_end()); - out.push('\n'); - out + format!("pub const UNICODE_VERSION: (u8, u8, u8) = ({major}, {minor}, {micro});") } -fn generate_tests(data: &UnicodeData) -> Result { - let mut s = String::new(); - writeln!(s, "#![feature(core_intrinsics)]")?; - writeln!(s, "#![allow(internal_features, dead_code)]")?; - writeln!(s, "// ignore-tidy-filelength")?; - writeln!(s, "use std::intrinsics;")?; - writeln!(s, "mod unicode_data;")?; - writeln!(s, "mod rt {{ {} }}", include_str!("../../../../library/core/src/unicode/rt.rs"))?; - writeln!(s, "fn main() {{")?; +fn generate_tests(data: &UnicodeData) -> String { + let mut s = format!( + "#![feature(core_intrinsics)] + #![allow(internal_features, dead_code)] + // ignore-tidy-filelength + use std::intrinsics; + mod unicode_data + fn main() {{" + ); for (property, ranges) in &data.ranges { let prop = property.to_lowercase(); - writeln!(s, r#" println!("Testing {prop}");"#)?; - writeln!(s, " {prop}_true();")?; - writeln!(s, " {prop}_false();")?; let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) 
.filter(|c| !c.is_ascii()) .map(u32::from) .partition(|c| ranges.iter().any(|r| r.contains(c))); - writeln!(s, " fn {prop}_true() {{")?; - generate_asserts(&mut s, &prop, &is_true, true)?; - writeln!(s, " }}")?; - - writeln!(s, " fn {prop}_false() {{")?; - generate_asserts(&mut s, &prop, &is_false, false)?; - writeln!(s, " }}")?; + writeln!( + s, + "println!(\"Testing {prop}\"); + {prop}_true(); + {prop}_false(); + fn {prop}_true() {{\n{}\n}} + fn {prop}_false() {{\n{}\n}}", + generate_asserts(&prop, &is_true, true), + generate_asserts(&prop, &is_false, false) + ); } for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper]) { - writeln!(s, r#" println!("Testing {name}");"#)?; + writeln!(s, r#"println!("Testing {name}");"#); for (c, mapping) in conversion { let c = char::from_u32(*c).unwrap(); let mapping = mapping.map(|c| char::from_u32(c).unwrap()); - writeln!( - s, - r#" assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"# - )?; + writeln!(s, "assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"); } let unmapped: Vec<_> = (char::MIN..=char::MAX) .filter(|c| !c.is_ascii()) @@ -376,40 +347,36 @@ fn generate_tests(data: &UnicodeData) -> Result { for range in unmapped_ranges { let start = char::from_u32(range.start).unwrap(); let end = char::from_u32(range.end - 1).unwrap(); - writeln!(s, " for c in {start:?}..={end:?} {{")?; writeln!( s, - r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# - )?; - - writeln!(s, " }}")?; + r#"for c in {start:?}..={end:?} {{ + assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']); + }}"# + ); } } - writeln!(s, "}}")?; - Ok(s) + writeln!(s, "}}"); + s } -fn generate_asserts( - s: &mut String, - prop: &str, - points: &[u32], - truthy: bool, -) -> Result<(), fmt::Error> { +fn generate_asserts(prop: &str, points: &[u32], truthy: bool) -> String { + let mut s = String::new(); let truthy = if truthy { "" } else { "!" 
}; for range in ranges_from_set(points) { let start = char::from_u32(range.start).unwrap(); let end = char::from_u32(range.end - 1).unwrap(); match range.len() { - 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, - _ => { - writeln!(s, " for c in {start:?}..={end:?} {{")?; - writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; - writeln!(s, " }}")?; - } + 1 => writeln!(s, "assert!({truthy}unicode_data::{prop}::lookup({start:?}));"), + _ => writeln!( + s, + "for c in {start:?}..={end:?} {{ + assert!({truthy}unicode_data::{prop}::lookup(c)); + }}" + ), } } - Ok(()) + s } /// Group the elements of `set` into contigous ranges diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 297965615c1a5..048507a06d44f 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -1,8 +1,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; -use std::fmt::{self, Write}; use std::ops::Range; -use crate::fmt_list; +use crate::{Bin, fmt_list, writeln}; #[derive(Clone)] pub struct RawEmitter { @@ -16,13 +15,6 @@ impl RawEmitter { RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() } } - fn blank_line(&mut self) { - if self.file.is_empty() || self.file.ends_with("\n\n") { - return; - } - writeln!(&mut self.file).unwrap(); - } - fn emit_bitset(&mut self, ranges: &[Range]) -> Result<(), String> { let first_code_point = ranges.first().unwrap().start; let last_code_point = ranges.last().unwrap().end; @@ -68,48 +60,33 @@ impl RawEmitter { } self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0); - struct Bits(u64); - impl fmt::Debug for Bits { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "0b{:064b}", self.0) - } - } - - writeln!( - &mut self.file, - "static BITSET_CANONICAL: [u64; {}] = [{}];", - canonicalized.canonical_words.len(), - 
fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))), - ) - .unwrap(); self.bytes_used += 8 * canonicalized.canonical_words.len(); - writeln!( - &mut self.file, - "static BITSET_MAPPING: [(u8, u8); {}] = [{}];", - canonicalized.canonicalized_words.len(), - fmt_list(&canonicalized.canonicalized_words), - ) - .unwrap(); // 8 bit index into shifted words, 7 bits for shift + optional flip // We only need it for the words that we removed by applying a shift and // flip to them. self.bytes_used += 2 * canonicalized.canonicalized_words.len(); - self.blank_line(); - - writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - if first_code_point > 0x7f { - writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap(); - } - writeln!(&mut self.file, " super::bitset_search(").unwrap(); - writeln!(&mut self.file, " c as u32,").unwrap(); - writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); - writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); - writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); - writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); - writeln!(&mut self.file, " )").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); + writeln!( + self.file, + "static BITSET_CANONICAL: [u64; {canonical_words_len}] = {canonical_words:?}; + static BITSET_MAPPING: [(u8, u8); {canonicalized_words_len}] = {canonicalized_words:?}; + + pub const fn lookup(c: char) -> bool {{ + debug_assert!(!c.is_ascii()); + (c as u32) >= {first_code_point:#04x} && + super::bitset_search( + c as u32, + &BITSET_CHUNKS_MAP, + &BITSET_INDEX_CHUNKS, + &BITSET_CANONICAL, + &BITSET_MAPPING, + ) + }}", + canonical_words = canonicalized.canonical_words, + canonical_words_len = canonicalized.canonical_words.len(), + canonicalized_words = canonicalized.canonicalized_words, + canonicalized_words_len = canonicalized.canonicalized_words.len(), + ); Ok(()) } @@ -133,29 
+110,21 @@ impl RawEmitter { chunk_indices.push(chunk_map[chunk]); } - writeln!( - &mut self.file, - "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", - chunk_indices.len(), - fmt_list(&chunk_indices), - ) - .unwrap(); self.bytes_used += chunk_indices.len(); writeln!( - &mut self.file, - "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];", - chunk_length, - chunks.len(), - fmt_list(chunks.iter()), - ) - .unwrap(); + self.file, + "static BITSET_CHUNKS_MAP: [u8; {chunk_indices_len}] = {chunk_indices:?}; + static BITSET_INDEX_CHUNKS: [[u8; {chunk_len}]; {chunks_len}] = [{chunks}];", + chunk_indices_len = chunk_indices.len(), + chunk_len = chunk_length, + chunks_len = chunks.len(), + chunks = fmt_list(chunks.iter()), + ); self.bytes_used += chunk_length * chunks.len(); } } pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { - emitter.blank_line(); - let mut bitset = emitter.clone(); let bitset_ok = bitset.emit_bitset(ranges).is_ok(); @@ -172,8 +141,6 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { } pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { - emitter.blank_line(); - let mut cascading = emitter.clone(); cascading.emit_cascading_map(ranges); *emitter = cascading; @@ -181,7 +148,7 @@ pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { } struct Canonicalized { - canonical_words: Vec, + canonical_words: Vec>, canonicalized_words: Vec<(u8, u8)>, /// Maps an input unique word to the associated index (u8) which is into @@ -394,6 +361,7 @@ impl Canonicalized { ) }) .collect::>(); + let canonical_words = canonical_words.into_iter().map(Bin).collect::>(); Canonicalized { unique_mapping, canonical_words, canonicalized_words } } } diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 9b38fd4864f10..742d61153db3d 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ 
-1,8 +1,8 @@ -use std::fmt::{self, Write as _}; +use std::fmt::{self}; use std::ops::Range; -use crate::fmt_list; use crate::raw_emitter::RawEmitter; +use crate::writeln; /// This will get packed into a single u32 before inserting into the data set. #[derive(PartialEq)] @@ -68,22 +68,7 @@ impl RawEmitter { assert!(inserted); } - writeln!(&mut self.file, "use super::ShortOffsetRunHeader;\n").unwrap(); - writeln!( - &mut self.file, - "static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; {}] = [{}];", - short_offset_runs.len(), - fmt_list(short_offset_runs.iter()) - ) - .unwrap(); self.bytes_used += 4 * short_offset_runs.len(); - writeln!( - &mut self.file, - "static OFFSETS: [u8; {}] = [{}];", - coded_offsets.len(), - fmt_list(&coded_offsets) - ) - .unwrap(); self.bytes_used += coded_offsets.len(); // The inlining in this code works like the following: @@ -94,46 +79,34 @@ impl RawEmitter { // The lower-bounds check is inlined into the caller, and slower-path // `skip_search` is outlined into a separate `lookup_slow` fn. 
assert!(first_code_point > 0x7f); - writeln!(&mut self.file, "#[inline]").unwrap(); - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") - .unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - writeln!(&mut self.file).unwrap(); - writeln!(&mut self.file, "#[inline(never)]").unwrap(); - writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " const {{").unwrap(); - writeln!( - &mut self.file, - " assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);", - ) - .unwrap(); - writeln!(&mut self.file, " let mut i = 0;").unwrap(); - writeln!(&mut self.file, " while i < SHORT_OFFSET_RUNS.len() {{").unwrap(); - writeln!( - &mut self.file, - " assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());", - ) - .unwrap(); - writeln!(&mut self.file, " i += 1;").unwrap(); - writeln!(&mut self.file, " }}").unwrap(); - writeln!(&mut self.file, " }}").unwrap(); - writeln!( - &mut self.file, - " // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`", - ) - .unwrap(); - writeln!( - &mut self.file, - " // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.", - ) - .unwrap(); - writeln!( - &mut self.file, - " unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }}" - ) - .unwrap(); - writeln!(&mut self.file, "}}").unwrap(); + writeln!(self.file, + "use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; {short_offset_runs_len}] = {short_offset_runs:?}; + static OFFSETS: [u8; {coded_offset_len}] = {coded_offsets:?}; + + #[inline] + pub fn lookup(c: char) -> bool {{ + debug_assert!(!c.is_ascii()); + (c as u32) >= {first_code_point:#04x} && lookup_slow(c) + }} + + #[inline(never)] + fn lookup_slow(c: char) -> bool {{ + const {{ + 
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() {{ + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + }} + }} + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. + unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }} + }}", + short_offset_runs_len = short_offset_runs.len(), + coded_offset_len = coded_offsets.len(), + ); } }