rust-lang · bors · Nov 3, 2025 · Nov 3, 2025
diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs
@@ -18,9 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
 
 pub(crate) mod printable;
 
-mod rt;
 #[allow(unreachable_pub)]
-pub mod unicode_data;
+mod unicode_data;
 
 /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
 /// `char` and `str` methods are based on.

diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs
@@ -116,7 +116,6 @@
 #![feature(try_find)]
 #![feature(try_trait_v2)]
 #![feature(uint_bit_width)]
-#![feature(unicode_internals)]
 #![feature(unsize)]
 #![feature(unwrap_infallible)]
 // tidy-alphabetical-end

diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs
@@ -1,101 +1,5 @@
-use core::unicode::unicode_data;
-use std::ops::RangeInclusive;
-
-mod test_data;
-
 #[test]
 pub fn version() {
     let (major, _minor, _update) = core::char::UNICODE_VERSION;
     assert!(major >= 10);
 }
-
-#[track_caller]
-fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
-    let mut start = '\u{80}';
-    for range in ranges {
-        for c in start..*range.start() {
-            assert!(!lookup(c), "{c:?}");
-        }
-        for c in range.clone() {
-            assert!(lookup(c), "{c:?}");
-        }
-        start = char::from_u32(*range.end() as u32 + 1).unwrap();
-    }
-    for c in start..=char::MAX {
-        assert!(!lookup(c), "{c:?}");
-    }
-}
-
-#[track_caller]
-fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
-    let mut start = '\u{80}';
-    for &(key, val) in ranges {
-        for c in start..key {
-            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
-        }
-        assert_eq!(lookup(key), val, "{key:?}");
-        start = char::from_u32(key as u32 + 1).unwrap();
-    }
-    for c in start..=char::MAX {
-        assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
-    }
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn alphabetic() {
-    test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn case_ignorable() {
-    test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn cased() {
-    test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn grapheme_extend() {
-    test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn lowercase() {
-    test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
-}
-
-#[test]
-fn n() {
-    test_boolean_property(test_data::N, unicode_data::n::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn uppercase() {
-    test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn white_space() {
-    test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn to_lowercase() {
-    test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
-}
-
-#[test]
-#[cfg_attr(miri, ignore)]
-fn to_uppercase() {
-    test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
-}
diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs
diff --git a/src/bootstrap/src/core/build_steps/run.rs b/src/bootstrap/src/core/build_steps/run.rs
@@ -374,7 +374,6 @@ impl Step for UnicodeTableGenerator {
     fn run(self, builder: &Builder<'_>) {
         let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
         cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
-        cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
         cmd.run(builder);
     }
 }

diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -1,8 +1,9 @@
 use std::collections::HashMap;
+use std::fmt::Write as _;
 use std::ops::Range;
 
+use crate::fmt_list;
 use crate::raw_emitter::RawEmitter;
-use crate::writeln;
 
 impl RawEmitter {
     pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
@@ -23,6 +24,8 @@ impl RawEmitter {
             .flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
             .collect::<Vec<u32>>();
 
+        println!("there are {} points", points.len());
+
         // how many distinct ranges need to be counted?
         let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
         for point in points {
@@ -34,41 +37,41 @@ impl RawEmitter {
         }
 
         let mut bit_for_high_byte = 1u8;
-        let mut arms = String::new();
+        let mut arms = Vec::<String>::new();
 
         let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
         high_bytes.sort();
         for high_byte in high_bytes {
             let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
             if codepoints.len() == 1 {
                 let ch = codepoints.pop().unwrap();
-                writeln!(arms, "{high_byte:#04x} => c as u32 == {ch:#04x},");
+                arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
                 continue;
             }
             // more than 1 codepoint in this arm
             for codepoint in codepoints {
                 map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
             }
-            writeln!(
-                arms,
-                "{high_byte:#04x} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
-            );
+            arms.push(format!(
+                "{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
+            ));
             bit_for_high_byte <<= 1;
         }
 
+        writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
+            .unwrap();
         self.bytes_used += 256;
-        self.file = format!(
-            "static WHITESPACE_MAP: [u8; 256] = {map:?};
 
-            #[inline]
-            pub const fn lookup(c: char) -> bool {{
-                debug_assert!(!c.is_ascii());
-                match c as u32 >> 8 {{
-                    {arms}\
-                    _ => false,
-                }}
-            }}"
-        );
+        writeln!(&mut self.file, "#[inline]").unwrap();
+        writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
+        writeln!(&mut self.file, "    match c as u32 >> 8 {{").unwrap();
+        for arm in arms {
+            writeln!(&mut self.file, "        {arm},").unwrap();
+        }
+        writeln!(&mut self.file, "        _ => false,").unwrap();
+        writeln!(&mut self.file, "    }}").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
 
         true
     }

diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs
@@ -1,25 +1,27 @@
 use std::char;
 use std::collections::BTreeMap;
+use std::fmt::{self, Write};
 
-use crate::fmt_helpers::Hex;
-use crate::{CharEscape, UnicodeData, fmt_list};
+use crate::{UnicodeData, fmt_list};
 
 const INDEX_MASK: u32 = 1 << 22;
 
 pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
+    let mut file = String::new();
+    // Output order: the INDEX_MASK constant, the HEADER text, then the LOWER and UPPER tables.
+    write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
+    file.push_str("\n\n");
+    file.push_str(HEADER.trim_start());
+    file.push('\n');
     let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
+    file.push_str(&lower_tables);
+    file.push_str("\n\n");
     let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
-    let file = format!(
-        "{lower_tables}
-        {upper_tables}"
-    );
+    file.push_str(&upper_tables);
     (file, [lower_size, upper_size])
 }
 
 fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
-    let case_lower = case.to_lowercase();
-    let case_upper = case.to_uppercase();
-
     let mut mappings = Vec::with_capacity(data.len());
     let mut multis = Vec::new();
 
@@ -42,49 +44,77 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
             INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
         };
 
-        mappings.push((CharEscape(key), Hex(value)));
+        mappings.push((CharEscape(key), value));
+    }
+
+    let mut tables = String::new();
+    let mut size = 0;
+
+    size += size_of_val(mappings.as_slice());
+    write!(
+        tables,
+        "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
+        case,
+        mappings.len(),
+        fmt_list(mappings),
+    )
+    .unwrap();
+
+    tables.push_str("\n\n");
+
+    size += size_of_val(multis.as_slice());
+    write!(
+        tables,
+        "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
+        case,
+        multis.len(),
+        fmt_list(multis),
+    )
+    .unwrap();
+
+    (tables, size)
+}
+
+struct CharEscape(char);
+// `Debug` writes the char in single quotes, escaped via `char::escape_default`.
+impl fmt::Debug for CharEscape {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "'{}'", self.0.escape_default())
+    }
+}
+// Source text of `to_lower`/`to_upper`; `generate_case_mapping` copies it into its output.
+static HEADER: &str = r"
+pub fn to_lower(c: char) -> [char; 3] {
+    if c.is_ascii() {
+        [(c as u8).to_ascii_lowercase() as char, '\0', '\0']
+    } else {
+        LOWERCASE_TABLE
+            .binary_search_by(|&(key, _)| key.cmp(&c))
+            .map(|i| {
+                let u = LOWERCASE_TABLE[i].1;
+                char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
+                    // SAFETY: Index comes from statically generated table
+                    unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
+                })
+            })
+            .unwrap_or([c, '\0', '\0'])
     }
+}
 
-    let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
-    let file = format!(
-        "
-#[rustfmt::skip]
-static {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
-
-#[rustfmt::skip]
-static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];
-
-#[inline]
-pub fn to_{case_lower}(c: char) -> [char; 3] {{
-    const {{
-        let mut i = 0;
-        while i < {case_upper}CASE_TABLE.len() {{
-            let (_, val) = {case_upper}CASE_TABLE[i];
-            if val & (1 << 22) == 0 {{
-                assert!(char::from_u32(val).is_some());
-            }} else {{
-                let index = val & ((1 << 22) - 1);
-                assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
-            }}
-            i += 1;
-        }}
-    }}
-
-    // SAFETY: Just checked that the tables are valid
-    unsafe {{
-        super::case_conversion(
-            c,
-            |c| c.to_ascii_{case_lower}case(),
-            {case_upper}CASE_TABLE,
-            {case_upper}CASE_TABLE_MULTI,
-        )
-    }}
-}}",
-        mappings = fmt_list(&mappings),
-        mappings_len = mappings.len(),
-        multis = fmt_list(&multis),
-        multis_len = multis.len(),
-    );
-
-    (file, size)
+pub fn to_upper(c: char) -> [char; 3] {
+    if c.is_ascii() {
+        [(c as u8).to_ascii_uppercase() as char, '\0', '\0']
+    } else {
+        UPPERCASE_TABLE
+            .binary_search_by(|&(key, _)| key.cmp(&c))
+            .map(|i| {
+                let u = UPPERCASE_TABLE[i].1;
+                char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
+                    // SAFETY: Index comes from statically generated table
+                    unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
+                })
+            })
+            .unwrap_or([c, '\0', '\0'])
+    }
 }
+";