Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;

pub(crate) mod printable;

mod rt;
#[allow(unreachable_pub)]
mod unicode_data;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Runtime support for `unicode_data`.

#[inline(always)]
const fn bitset_search<
pub(super) const fn bitset_search<
const N: usize,
const CHUNK_SIZE: usize,
const N1: usize,
Expand Down Expand Up @@ -46,23 +48,23 @@ const fn bitset_search<
}

#[repr(transparent)]
struct ShortOffsetRunHeader(u32);
pub(super) struct ShortOffsetRunHeader(pub(super) u32);

impl ShortOffsetRunHeader {
const fn new(start_index: usize, prefix_sum: u32) -> Self {
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
assert!(start_index < (1 << 11));
assert!(prefix_sum < (1 << 21));

Self((start_index as u32) << 21 | prefix_sum)
}

#[inline]
const fn start_index(&self) -> usize {
pub(super) const fn start_index(&self) -> usize {
(self.0 >> 21) as usize
}

#[inline]
const fn prefix_sum(&self) -> u32 {
pub(super) const fn prefix_sum(&self) -> u32 {
self.0 & ((1 << 21) - 1)
}
}
Expand All @@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
#[inline(always)]
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
needle: char,
short_offset_runs: &[ShortOffsetRunHeader; SOR],
offsets: &[u8; OFFSETS],
Expand Down
818 changes: 414 additions & 404 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

39 changes: 18 additions & 21 deletions src/tools/unicode-table-generator/src/cascading_map.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::collections::HashMap;
use std::fmt::Write as _;
use std::ops::Range;

use crate::fmt_list;
use crate::raw_emitter::RawEmitter;
use crate::writeln;

impl RawEmitter {
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
Expand All @@ -24,8 +23,6 @@ impl RawEmitter {
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
.collect::<Vec<u32>>();

println!("there are {} points", points.len());

// how many distinct ranges need to be counted?
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
for point in points {
Expand All @@ -37,41 +34,41 @@ impl RawEmitter {
}

let mut bit_for_high_byte = 1u8;
let mut arms = Vec::<String>::new();
let mut arms = String::new();

let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
high_bytes.sort();
for high_byte in high_bytes {
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
if codepoints.len() == 1 {
let ch = codepoints.pop().unwrap();
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
writeln!(arms, "{high_byte} => c as u32 == {ch:#04x},");
continue;
}
// more than 1 codepoint in this arm
for codepoint in codepoints {
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
}
arms.push(format!(
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
));
writeln!(
arms,
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
);
bit_for_high_byte <<= 1;
}

writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
.unwrap();
self.bytes_used += 256;
self.file = format!(
"static WHITESPACE_MAP: [u8; 256] = {map:?};
writeln!(&mut self.file, "#[inline]").unwrap();
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
for arm in arms {
writeln!(&mut self.file, " {arm},").unwrap();
}
writeln!(&mut self.file, " _ => false,").unwrap();
writeln!(&mut self.file, " }}").unwrap();
writeln!(&mut self.file, "}}").unwrap();
#[inline]
pub const fn lookup(c: char) -> bool {{
debug_assert!(!c.is_ascii());
match c as u32 >> 8 {{
{arms}\
_ => false,
}}
}}"
);

true
}
Expand Down
65 changes: 20 additions & 45 deletions src/tools/unicode-table-generator/src/case_mapping.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
use std::char;
use std::collections::BTreeMap;
use std::fmt::{self, Write};

use crate::{UnicodeData, fmt_list};
use crate::{CharEscape, UnicodeData, fmt_list};

const INDEX_MASK: u32 = 1 << 22;

pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
let mut file = String::new();

write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
file.push_str("\n\n");
file.push_str(HEADER.trim_start());
file.push('\n');
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
file.push_str(&lower_tables);
file.push_str("\n\n");
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
file.push_str(&upper_tables);
let file = format!(
"{HEADER}
{lower_tables}
{upper_tables}"
);
(file, [lower_size, upper_size])
}

Expand Down Expand Up @@ -47,43 +42,23 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
mappings.push((CharEscape(key), value));
}

let mut tables = String::new();
let mut size = 0;

size += size_of_val(mappings.as_slice());
write!(
tables,
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
case,
mappings.len(),
fmt_list(mappings),
)
.unwrap();

tables.push_str("\n\n");

size += size_of_val(multis.as_slice());
write!(
tables,
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
case,
multis.len(),
fmt_list(multis),
)
.unwrap();

(tables, size)
}

struct CharEscape(char);

impl fmt::Debug for CharEscape {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "'{}'", self.0.escape_default())
}
let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
let file = format!(
"
#[rustfmt::skip]\nstatic {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
#[rustfmt::skip]\nstatic {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];",
mappings = fmt_list(&mappings),
mappings_len = mappings.len(),
multis = fmt_list(&multis),
multis_len = multis.len(),
);

(file, size)
}

static HEADER: &str = r"
const INDEX_MASK: u32 = 1 << 22;

pub fn to_lower(c: char) -> [char; 3] {
if c.is_ascii() {
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
Expand Down
66 changes: 66 additions & 0 deletions src/tools/unicode-table-generator/src/fmt_helpers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::fmt;

// Convenience macros for writing and unwrapping.

/// Forwards all of its tokens to [`std::writeln!`] and unwraps the result.
///
/// `std::fmt::Write` is imported anonymously inside the expansion, so the call
/// site does not need the trait in scope. The expansion is a block that ends in
/// a statement, so it evaluates to `()`; a formatting error panics via `unwrap`.
#[macro_export]
macro_rules! writeln {
    ($($tokens:tt)*) => {{
        use std::fmt::Write as _;
        let outcome = std::writeln!($($tokens)*);
        outcome.unwrap();
    }};
}
/// Forwards all of its tokens to [`std::write!`] and unwraps the result.
///
/// `std::fmt::Write` is imported anonymously inside the expansion, so the call
/// site does not need the trait in scope. The expansion is a block that ends in
/// a statement, so it evaluates to `()`; a formatting error panics via `unwrap`.
#[macro_export]
macro_rules! write {
    ($($tokens:tt)*) => {{
        use std::fmt::Write as _;
        let outcome = std::write!($($tokens)*);
        outcome.unwrap();
    }};
}

/// Renders `values` as a wrapped, comma-separated list of their `Debug` output.
///
/// Every value becomes the piece `{value:?}, `. Pieces are appended to the
/// current line while the line's byte length plus the piece's byte length stays
/// below 98; otherwise the current line is emitted and the piece starts a new,
/// indented line. The first line begins with a newline followed by the indent.
/// Each emitted line has its trailing whitespace trimmed and is terminated by
/// `\n`, so an empty input yields just `"\n"`.
pub fn fmt_list<V: fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    let mut rendered = String::new();
    let mut current = String::from("\n ");

    for value in values {
        let item = format!("{value:?}, ");
        if current.len() + item.len() >= 98 {
            // The line is full: flush it and start a fresh indented one.
            rendered.push_str(current.trim_end());
            rendered.push('\n');
            current.clear();
            current.push(' ');
        }
        current.push_str(&item);
    }

    // Flush whatever is left (possibly only the initial newline + indent).
    rendered.push_str(current.trim_end());
    rendered.push('\n');
    rendered
}

/// Wrapper that formats the inner `T` through its `Binary` implementation.
///
/// `Debug` and `Display` produce the same text: the literal prefix `0b`
/// followed by the value formatted with `{:b}`, zero-padded to
/// `size_of::<T>() * 8` digits.
#[derive(Copy, Clone)]
pub struct Bin<T>(pub T);

impl<T: fmt::Binary> fmt::Debug for Bin<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // One digit per bit of `T`.
        let width = 8 * std::mem::size_of::<T>();
        f.write_str("0b")?;
        // `std::write!` is spelled out because this file defines its own `write!`.
        std::write!(f, "{:0width$b}", self.0)
    }
}

impl<T: fmt::Binary> fmt::Display for Bin<T> {
    /// Delegates to the `Debug` implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Debug>::fmt(self, f)
    }
}

/// Wrapper that formats a `char` via `char::escape_default`, in single quotes.
///
/// `Debug` and `Display` produce the same text: `'`, the escaped character,
/// then `'`.
#[derive(Copy, Clone)]
pub struct CharEscape(pub char);

impl fmt::Debug for CharEscape {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("'")?;
        // `std::write!` is spelled out because this file defines its own `write!`.
        std::write!(f, "{}", self.0.escape_default())?;
        f.write_str("'")
    }
}

impl fmt::Display for CharEscape {
    /// Delegates to the `Debug` implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Debug>::fmt(self, f)
    }
}
Loading
Loading