Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;

pub(crate) mod printable;

mod rt;
#[allow(unreachable_pub)]
pub mod unicode_data;
mod unicode_data;

/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on.
Expand Down
2,537 changes: 1,215 additions & 1,322 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion library/coretests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@
#![feature(try_find)]
#![feature(try_trait_v2)]
#![feature(uint_bit_width)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(unwrap_infallible)]
// tidy-alphabetical-end
Expand Down
96 changes: 0 additions & 96 deletions library/coretests/tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -1,101 +1,5 @@
use core::unicode::unicode_data;
use std::ops::RangeInclusive;

mod test_data;

/// Sanity check: the Unicode version reported by `core::char` must have a
/// major component of at least 10.
#[test]
pub fn version() {
    // Only the major component is inspected; minor and update are ignored.
    let major = core::char::UNICODE_VERSION.0;
    assert!(major >= 10);
}

/// Checks a boolean Unicode property lookup against a reference range list.
///
/// Every scalar value from U+0080 through `char::MAX` is visited: `lookup`
/// must return `true` for each `char` inside one of `ranges` and `false` for
/// each `char` in the gaps before, between and after them.
///
/// `ranges` is assumed to be sorted ascending and non-overlapping; the scan
/// only ever moves forward and does not validate the ordering itself.
///
/// # Panics
///
/// Panics (reporting the offending `char`) on the first value for which
/// `lookup` disagrees with `ranges`.
#[track_caller]
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
    // First scalar value that has not been checked yet. Becomes `None` once a
    // range ending at `char::MAX` has been consumed, i.e. nothing is left.
    let mut next_unchecked = Some('\u{80}');

    for range in ranges {
        // Gap in front of this range: the property must be absent.
        if let Some(gap_start) = next_unchecked {
            for c in gap_start..*range.start() {
                assert!(!lookup(c), "{c:?}");
            }
        }

        // The range itself: the property must be present.
        for c in range.clone() {
            assert!(lookup(c), "{c:?}");
        }

        // Successor of the range end. Stepping through a `char` range skips
        // the surrogate gap (U+D7FF -> U+E000) and yields `None` past
        // `char::MAX`, where `char::from_u32(end + 1).unwrap()` would panic.
        next_unchecked = (*range.end()..=char::MAX).nth(1);
    }

    // Tail after the last range: the property must be absent.
    if let Some(tail_start) = next_unchecked {
        for c in tail_start..=char::MAX {
            assert!(!lookup(c), "{c:?}");
        }
    }
}

/// Checks a case-conversion lookup against a reference mapping table.
///
/// Every scalar value from U+0080 through `char::MAX` is visited: for each
/// `(key, val)` entry in `ranges`, `lookup(key)` must equal `val`; for every
/// other `char` `c`, `lookup(c)` must be the identity mapping
/// `[c, '\0', '\0']`.
///
/// `ranges` is assumed to be sorted ascending by key without duplicates; the
/// scan only ever moves forward and does not validate the ordering itself.
///
/// # Panics
///
/// Panics (reporting the offending `char`) on the first value for which
/// `lookup` disagrees with the table.
#[track_caller]
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
    // First scalar value that has not been checked yet. Becomes `None` once an
    // entry keyed by `char::MAX` has been consumed, i.e. nothing is left.
    let mut next_unchecked = Some('\u{80}');

    for &(key, val) in ranges {
        // Unmapped characters in front of this key must map to themselves.
        if let Some(gap_start) = next_unchecked {
            for c in gap_start..key {
                assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
            }
        }

        assert_eq!(lookup(key), val, "{key:?}");

        // Successor of `key`. Stepping through a `char` range skips the
        // surrogate gap (U+D7FF -> U+E000) and yields `None` past `char::MAX`,
        // where `char::from_u32(key + 1).unwrap()` would panic.
        next_unchecked = (key..=char::MAX).nth(1);
    }

    // Unmapped characters after the last key must map to themselves.
    if let Some(tail_start) = next_unchecked {
        for c in tail_start..=char::MAX {
            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
        }
    }
}

// Checks `unicode_data::alphabetic::lookup` against the reference ranges in
// `test_data::ALPHABETIC` using `test_boolean_property`, which walks every
// `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn alphabetic() {
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
}

// Checks `unicode_data::case_ignorable::lookup` against the reference ranges
// in `test_data::CASE_IGNORABLE` using `test_boolean_property`, which walks
// every `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn case_ignorable() {
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
}

// Checks `unicode_data::cased::lookup` against the reference ranges in
// `test_data::CASED` using `test_boolean_property`, which walks every `char`
// from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn cased() {
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
}

// Checks `unicode_data::grapheme_extend::lookup` against the reference ranges
// in `test_data::GRAPHEME_EXTEND` using `test_boolean_property`, which walks
// every `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn grapheme_extend() {
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
}

// Checks `unicode_data::lowercase::lookup` against the reference ranges in
// `test_data::LOWERCASE` using `test_boolean_property`, which walks every
// `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn lowercase() {
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
}

// Checks `unicode_data::n::lookup` against the reference ranges in
// `test_data::N` using `test_boolean_property`, which walks every `char` from
// U+0080 to `char::MAX`.
// NOTE(review): unlike the other property tests in this file, this one has no
// `#[cfg_attr(miri, ignore)]` — confirm that running it under Miri is intended.
#[test]
fn n() {
test_boolean_property(test_data::N, unicode_data::n::lookup);
}

// Checks `unicode_data::uppercase::lookup` against the reference ranges in
// `test_data::UPPERCASE` using `test_boolean_property`, which walks every
// `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn uppercase() {
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
}

// Checks `unicode_data::white_space::lookup` against the reference ranges in
// `test_data::WHITE_SPACE` using `test_boolean_property`, which walks every
// `char` from U+0080 to `char::MAX`. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn white_space() {
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
}

// Checks `unicode_data::conversions::to_lower` against the reference table
// `test_data::TO_LOWER` using `test_case_mapping`, which walks every `char`
// from U+0080 to `char::MAX` and expects the identity mapping for characters
// absent from the table. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn to_lowercase() {
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
}

// Checks `unicode_data::conversions::to_upper` against the reference table
// `test_data::TO_UPPER` using `test_case_mapping`, which walks every `char`
// from U+0080 to `char::MAX` and expects the identity mapping for characters
// absent from the table. Ignored when running under Miri.
#[test]
#[cfg_attr(miri, ignore)]
fn to_uppercase() {
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
}
2,928 changes: 0 additions & 2,928 deletions library/coretests/tests/unicode/test_data.rs

This file was deleted.

1 change: 0 additions & 1 deletion src/bootstrap/src/core/build_steps/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,6 @@ impl Step for UnicodeTableGenerator {
/// Builds the command for `Tool::UnicodeTableGenerator`, appends two path
/// arguments rooted at `builder.src`, and runs it.
fn run(self, builder: &Builder<'_>) {
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
// Argument order matters to the tool: first the `core` table source file,
// then the `coretests` test-data file.
// NOTE(review): presumably both are output paths the generator overwrites —
// confirm against the tool's argument handling.
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
cmd.run(builder);
}
}
Expand Down
39 changes: 21 additions & 18 deletions src/tools/unicode-table-generator/src/cascading_map.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use std::collections::HashMap;
use std::fmt::Write as _;
use std::ops::Range;

use crate::fmt_list;
use crate::raw_emitter::RawEmitter;
use crate::writeln;

impl RawEmitter {
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
Expand All @@ -23,6 +24,8 @@ impl RawEmitter {
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
.collect::<Vec<u32>>();

println!("there are {} points", points.len());

// how many distinct ranges need to be counted?
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
for point in points {
Expand All @@ -34,41 +37,41 @@ impl RawEmitter {
}

let mut bit_for_high_byte = 1u8;
let mut arms = String::new();
let mut arms = Vec::<String>::new();

let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
high_bytes.sort();
for high_byte in high_bytes {
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
if codepoints.len() == 1 {
let ch = codepoints.pop().unwrap();
writeln!(arms, "{high_byte:#04x} => c as u32 == {ch:#04x},");
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
continue;
}
// more than 1 codepoint in this arm
for codepoint in codepoints {
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
}
writeln!(
arms,
"{high_byte:#04x} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
);
arms.push(format!(
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
));
bit_for_high_byte <<= 1;
}

writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
.unwrap();
self.bytes_used += 256;
self.file = format!(
"static WHITESPACE_MAP: [u8; 256] = {map:?};

#[inline]
pub const fn lookup(c: char) -> bool {{
debug_assert!(!c.is_ascii());
match c as u32 >> 8 {{
{arms}\
_ => false,
}}
}}"
);
writeln!(&mut self.file, "#[inline]").unwrap();
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
for arm in arms {
writeln!(&mut self.file, " {arm},").unwrap();
}
writeln!(&mut self.file, " _ => false,").unwrap();
writeln!(&mut self.file, " }}").unwrap();
writeln!(&mut self.file, "}}").unwrap();

true
}
Expand Down
134 changes: 82 additions & 52 deletions src/tools/unicode-table-generator/src/case_mapping.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
use std::char;
use std::collections::BTreeMap;
use std::fmt::{self, Write};

use crate::fmt_helpers::Hex;
use crate::{CharEscape, UnicodeData, fmt_list};
use crate::{UnicodeData, fmt_list};

const INDEX_MASK: u32 = 1 << 22;

pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
let mut file = String::new();

write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
file.push_str("\n\n");
file.push_str(HEADER.trim_start());
file.push('\n');
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
file.push_str(&lower_tables);
file.push_str("\n\n");
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
let file = format!(
"{lower_tables}
{upper_tables}"
);
file.push_str(&upper_tables);
(file, [lower_size, upper_size])
}

fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
let case_lower = case.to_lowercase();
let case_upper = case.to_uppercase();

let mut mappings = Vec::with_capacity(data.len());
let mut multis = Vec::new();

Expand All @@ -42,49 +44,77 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
};

mappings.push((CharEscape(key), Hex(value)));
mappings.push((CharEscape(key), value));
}

let mut tables = String::new();
let mut size = 0;

size += size_of_val(mappings.as_slice());
write!(
tables,
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
case,
mappings.len(),
fmt_list(mappings),
)
.unwrap();

tables.push_str("\n\n");

size += size_of_val(multis.as_slice());
write!(
tables,
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
case,
multis.len(),
fmt_list(multis),
)
.unwrap();

(tables, size)
}

/// Newtype around `char` whose `Debug` output is the character wrapped in
/// single quotes and escaped with `char::escape_default`, e.g. `'a'`, `'\n'`
/// or `'\u{e9}'`.
struct CharEscape(char);

impl fmt::Debug for CharEscape {
    /// Writes an opening quote, the default-escaped character, and a closing
    /// quote. Width, fill and other formatter flags are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let escaped: String = self.0.escape_default().collect();
        f.write_str("'")?;
        f.write_str(&escaped)?;
        f.write_str("'")
    }
}

static HEADER: &str = r"
pub fn to_lower(c: char) -> [char; 3] {
if c.is_ascii() {
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
} else {
LOWERCASE_TABLE
.binary_search_by(|&(key, _)| key.cmp(&c))
.map(|i| {
let u = LOWERCASE_TABLE[i].1;
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
// SAFETY: Index comes from statically generated table
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
})
})
.unwrap_or([c, '\0', '\0'])
}
}

let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
let file = format!(
"
#[rustfmt::skip]
static {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];

#[rustfmt::skip]
static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];

#[inline]
pub fn to_{case_lower}(c: char) -> [char; 3] {{
const {{
let mut i = 0;
while i < {case_upper}CASE_TABLE.len() {{
let (_, val) = {case_upper}CASE_TABLE[i];
if val & (1 << 22) == 0 {{
assert!(char::from_u32(val).is_some());
}} else {{
let index = val & ((1 << 22) - 1);
assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
}}
i += 1;
}}
}}

// SAFETY: Just checked that the tables are valid
unsafe {{
super::case_conversion(
c,
|c| c.to_ascii_{case_lower}case(),
{case_upper}CASE_TABLE,
{case_upper}CASE_TABLE_MULTI,
)
}}
}}",
mappings = fmt_list(&mappings),
mappings_len = mappings.len(),
multis = fmt_list(&multis),
multis_len = multis.len(),
);

(file, size)
pub fn to_upper(c: char) -> [char; 3] {
if c.is_ascii() {
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
} else {
UPPERCASE_TABLE
.binary_search_by(|&(key, _)| key.cmp(&c))
.map(|i| {
let u = UPPERCASE_TABLE[i].1;
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
// SAFETY: Index comes from statically generated table
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
})
})
.unwrap_or([c, '\0', '\0'])
}
}
";
Loading
Loading