Skip to content

Commit 8858dcd

Browse files
committed
Move more code out of unicode-table-generator into core
1 parent d85276b commit 8858dcd

File tree

4 files changed

+91
-147
lines changed

4 files changed

+91
-147
lines changed

library/core/src/unicode/rt.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,34 @@ impl ShortOffsetRunHeader {
6969
}
7070
}
7171

72+
/// Combination of compile-time verification + unsafe call to the `skip_search` function.
73+
// FIXME(const-hack): this should really just make the tables generic instead of using a macro to
74+
// combine these two; that way, the compile-time verification can be folded into the function.
75+
// But that requires ADT const params, and it felt better to avoid that for now.
76+
pub macro skip_search($needle:expr, $short_offset_runs:expr, $offsets:expr $(,)?) {{
77+
const {
78+
$crate::unicode::rt::assert_skip_search_valid($short_offset_runs, $offsets);
79+
}
80+
let (needle, runs, offsets) = ($needle, $short_offset_runs, $offsets); // expanded outside `unsafe`
81+
// SAFETY: The `const` block above verified the preconditions at compile time.
82+
unsafe { $crate::unicode::rt::skip_search(needle, runs, offsets) }
83+
}}
84+
85+
/// Compile-time verification of the safety preconditions of [`skip_search`]; panics (a build error when called in a `const` block) if one is violated.
86+
pub(super) const fn assert_skip_search_valid(
87+
short_offset_runs: &[ShortOffsetRunHeader],
88+
offsets: &[u8],
89+
) {
90+
assert!(short_offset_runs.last().unwrap().0 > char::MAX as u32); // last header must exceed `char::MAX`; an empty slice panics in `unwrap`
91+
92+
// FIXME(const-hack): use an iterator once `Iterator` is usable in `const fn`; until then, index manually.
93+
let mut i = 0;
94+
while i < short_offset_runs.len() {
95+
assert!(short_offset_runs[i].start_index() < offsets.len()); // every run's start index must be in bounds of `offsets`
96+
i += 1;
97+
}
98+
}
99+
72100
/// # Safety
73101
///
74102
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
@@ -129,6 +157,33 @@ pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
129157
offset_idx % 2 == 1
130158
}
131159

160+
/// Combination of compile-time verification + unsafe call to the `case_conversion` function.
161+
// FIXME(const-hack): same as skip_search docs
162+
pub macro case_conversion($c:expr, $ascii_fn:expr, $table:expr, $multi:expr $(,)?) {{
163+
const {
164+
$crate::unicode::rt::assert_case_conversion_valid($table, $multi);
165+
}
166+
let (ch, table, multi) = ($c, $table, $multi); // expanded outside `unsafe`
167+
// SAFETY: The `const` block above verified the tables. NOTE(review): `$ascii_fn` is still expanded inside `unsafe`; hoisting a closure into a `let` may break its parameter inference - confirm.
168+
unsafe { $crate::unicode::rt::case_conversion(ch, $ascii_fn, table, multi) }
169+
}}
170+
171+
/// Compile-time verification of the safety preconditions of [`case_conversion`]; panics (a build error when called in a `const` block) if an entry of `table` is invalid.
172+
pub(super) const fn assert_case_conversion_valid(table: &[(char, u32)], multi: &[[char; 3]]) {
173+
// FIXME(const-hack): use an iterator once `Iterator` is usable in `const fn`; until then, index manually.
174+
let mut i = 0;
175+
while i < table.len() {
176+
let (_, val) = table[i]; // only the second tuple component is validated
177+
if val & (1 << 22) == 0 { // bit 22 clear: `val` itself must be a valid `char`
178+
assert!(char::from_u32(val).is_some());
179+
} else { // bit 22 set: the low 22 bits are an index into `multi`
180+
let index = val & ((1 << 22) - 1);
181+
assert!((index as usize) < multi.len()); // the index must be in bounds of `multi`
182+
}
183+
i += 1;
184+
}
185+
}
186+
132187
/// # Safety
133188
/// The second component of each tuple in `table` must either be:
134189
/// - A valid `char`

library/core/src/unicode/unicode_data.rs

Lines changed: 27 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pub const UNICODE_VERSION: (u8, u8, u8) = (17, 0, 0);
1818
pub mod alphabetic {
1919
use super::ShortOffsetRunHeader;
2020

21-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 51] = [
21+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; 51] = &[
2222
ShortOffsetRunHeader::new(0, 706),
2323
ShortOffsetRunHeader::new(12, 4681),
2424
ShortOffsetRunHeader::new(414, 5741),
@@ -71,7 +71,7 @@ pub mod alphabetic {
7171
ShortOffsetRunHeader::new(1516, 210042),
7272
ShortOffsetRunHeader::new(1518, 1324154),
7373
];
74-
static OFFSETS: [u8; 1519] = [
74+
static OFFSETS: &[u8; 1519] = &[
7575
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 29, 18, 1, 2, 2,
7676
4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, 39, 14, 1, 1,
7777
1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, 3, 2, 1,
@@ -139,24 +139,14 @@ pub mod alphabetic {
139139

140140
#[inline(never)]
141141
fn lookup_slow(c: char) -> bool {
142-
const {
143-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
144-
let mut i = 0;
145-
while i < SHORT_OFFSET_RUNS.len() {
146-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
147-
i += 1;
148-
}
149-
}
150-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
151-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
152-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
142+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS);
153143
}
154144
}
155145

156146
pub mod case_ignorable {
157147
use super::ShortOffsetRunHeader;
158148

159-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 36] = [
149+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; 36] = &[
160150
ShortOffsetRunHeader::new(0, 688),
161151
ShortOffsetRunHeader::new(11, 4957),
162152
ShortOffsetRunHeader::new(263, 5906),
@@ -194,7 +184,7 @@ pub mod case_ignorable {
194184
ShortOffsetRunHeader::new(911, 917505),
195185
ShortOffsetRunHeader::new(913, 2032112),
196186
];
197-
static OFFSETS: [u8; 919] = [
187+
static OFFSETS: &[u8; 919] = &[
198188
168, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, 1, 1, 251, 7, 207, 1, 5, 1, 49,
199189
45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, 1, 10, 21, 16, 1, 101, 8, 1,
200190
10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, 24, 43, 3, 44, 1, 7, 2, 5,
@@ -239,24 +229,14 @@ pub mod case_ignorable {
239229

240230
#[inline(never)]
241231
fn lookup_slow(c: char) -> bool {
242-
const {
243-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
244-
let mut i = 0;
245-
while i < SHORT_OFFSET_RUNS.len() {
246-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
247-
i += 1;
248-
}
249-
}
250-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
251-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
252-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
232+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS);
253233
}
254234
}
255235

256236
pub mod cased {
257237
use super::ShortOffsetRunHeader;
258238

259-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
239+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; 22] = &[
260240
ShortOffsetRunHeader::new(0, 4256),
261241
ShortOffsetRunHeader::new(51, 5024),
262242
ShortOffsetRunHeader::new(61, 7296),
@@ -280,7 +260,7 @@ pub mod cased {
280260
ShortOffsetRunHeader::new(305, 127280),
281261
ShortOffsetRunHeader::new(307, 1241482),
282262
];
283-
static OFFSETS: [u8; 313] = [
263+
static OFFSETS: &[u8; 313] = &[
284264
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4,
285265
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1,
286266
1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6,
@@ -303,24 +283,14 @@ pub mod cased {
303283

304284
#[inline(never)]
305285
fn lookup_slow(c: char) -> bool {
306-
const {
307-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
308-
let mut i = 0;
309-
while i < SHORT_OFFSET_RUNS.len() {
310-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
311-
i += 1;
312-
}
313-
}
314-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
315-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
316-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
286+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS);
317287
}
318288
}
319289

320290
pub mod grapheme_extend {
321291
use super::ShortOffsetRunHeader;
322292

323-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 33] = [
293+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; 33] = &[
324294
ShortOffsetRunHeader::new(0, 768),
325295
ShortOffsetRunHeader::new(1, 1155),
326296
ShortOffsetRunHeader::new(3, 1425),
@@ -355,7 +325,7 @@ pub mod grapheme_extend {
355325
ShortOffsetRunHeader::new(759, 917536),
356326
ShortOffsetRunHeader::new(763, 2032112),
357327
];
358-
static OFFSETS: [u8; 767] = [
328+
static OFFSETS: &[u8; 767] = &[
359329
0, 112, 0, 7, 0, 45, 1, 1, 1, 2, 1, 2, 1, 1, 72, 11, 48, 21, 16, 1, 101, 7, 2, 6, 2, 2, 1,
360330
4, 35, 1, 30, 27, 91, 11, 58, 9, 9, 1, 24, 4, 1, 9, 1, 3, 1, 5, 43, 3, 59, 9, 42, 24, 1,
361331
32, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 29, 1, 58, 1, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 26,
@@ -394,17 +364,7 @@ pub mod grapheme_extend {
394364

395365
#[inline(never)]
396366
fn lookup_slow(c: char) -> bool {
397-
const {
398-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
399-
let mut i = 0;
400-
while i < SHORT_OFFSET_RUNS.len() {
401-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
402-
i += 1;
403-
}
404-
}
405-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
406-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
407-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
367+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS);
408368
}
409369
}
410370

@@ -538,7 +498,7 @@ pub mod lowercase {
538498
pub mod n {
539499
use super::ShortOffsetRunHeader;
540500

541-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 43] = [
501+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; 43] = &[
542502
ShortOffsetRunHeader::new(0, 1632),
543503
ShortOffsetRunHeader::new(7, 2406),
544504
ShortOffsetRunHeader::new(13, 4160),
@@ -583,7 +543,7 @@ pub mod n {
583543
ShortOffsetRunHeader::new(287, 130032),
584544
ShortOffsetRunHeader::new(289, 1244154),
585545
];
586-
static OFFSETS: [u8; 291] = [
546+
static OFFSETS: &[u8; 291] = &[
587547
178, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, 10, 118,
588548
10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, 70, 20,
589549
0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, 182,
@@ -607,17 +567,7 @@ pub mod n {
607567

608568
#[inline(never)]
609569
fn lookup_slow(c: char) -> bool {
610-
const {
611-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
612-
let mut i = 0;
613-
while i < SHORT_OFFSET_RUNS.len() {
614-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
615-
i += 1;
616-
}
617-
}
618-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
619-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
620-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
570+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS);
621571
}
622572
}
623573

@@ -1163,29 +1113,12 @@ pub mod conversions {
11631113

11641114
#[inline]
11651115
pub fn to_lower(c: char) -> [char; 3] {
1166-
const {
1167-
let mut i = 0;
1168-
while i < LOWERCASE_TABLE.len() {
1169-
let (_, val) = LOWERCASE_TABLE[i];
1170-
if val & (1 << 22) == 0 {
1171-
assert!(char::from_u32(val).is_some());
1172-
} else {
1173-
let index = val & ((1 << 22) - 1);
1174-
assert!((index as usize) < LOWERCASE_TABLE_MULTI.len());
1175-
}
1176-
i += 1;
1177-
}
1178-
}
1179-
1180-
// SAFETY: Just checked that the tables are valid
1181-
unsafe {
1182-
super::case_conversion(
1183-
c,
1184-
|c| c.to_ascii_lowercase(),
1185-
LOWERCASE_TABLE,
1186-
LOWERCASE_TABLE_MULTI,
1187-
)
1188-
}
1116+
super::case_conversion!( // the macro validates both tables in a `const` block before its unsafe lookup
1117+
c,
1118+
|c| c.to_ascii_lowercase(),
1119+
LOWERCASE_TABLE,
1120+
LOWERCASE_TABLE_MULTI,
1121+
)
11891122
}
11901123

11911124
#[rustfmt::skip]
@@ -1668,28 +1601,11 @@ pub mod conversions {
16681601

16691602
#[inline]
16701603
pub fn to_upper(c: char) -> [char; 3] {
1671-
const {
1672-
let mut i = 0;
1673-
while i < UPPERCASE_TABLE.len() {
1674-
let (_, val) = UPPERCASE_TABLE[i];
1675-
if val & (1 << 22) == 0 {
1676-
assert!(char::from_u32(val).is_some());
1677-
} else {
1678-
let index = val & ((1 << 22) - 1);
1679-
assert!((index as usize) < UPPERCASE_TABLE_MULTI.len());
1680-
}
1681-
i += 1;
1682-
}
1683-
}
1684-
1685-
// SAFETY: Just checked that the tables are valid
1686-
unsafe {
1687-
super::case_conversion(
1688-
c,
1689-
|c| c.to_ascii_uppercase(),
1690-
UPPERCASE_TABLE,
1691-
UPPERCASE_TABLE_MULTI,
1692-
)
1693-
}
1604+
super::case_conversion!( // the macro validates both tables in a `const` block before its unsafe lookup
1605+
c,
1606+
|c| c.to_ascii_uppercase(),
1607+
UPPERCASE_TABLE,
1608+
UPPERCASE_TABLE_MULTI,
1609+
)
16941610
}
16951611
}

src/tools/unicode-table-generator/src/case_mapping.rs

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -56,29 +56,12 @@ static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];
5656
5757
#[inline]
5858
pub fn to_{case_lower}(c: char) -> [char; 3] {{
59-
const {{
60-
let mut i = 0;
61-
while i < {case_upper}CASE_TABLE.len() {{
62-
let (_, val) = {case_upper}CASE_TABLE[i];
63-
if val & (1 << 22) == 0 {{
64-
assert!(char::from_u32(val).is_some());
65-
}} else {{
66-
let index = val & ((1 << 22) - 1);
67-
assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
68-
}}
69-
i += 1;
70-
}}
71-
}}
72-
73-
// SAFETY: Just checked that the tables are valid
74-
unsafe {{
75-
super::case_conversion(
76-
c,
77-
|c| c.to_ascii_{case_lower}case(),
78-
{case_upper}CASE_TABLE,
79-
{case_upper}CASE_TABLE_MULTI,
80-
)
81-
}}
59+
super::case_conversion!(
60+
c,
61+
|c| c.to_ascii_{case_lower}case(),
62+
{case_upper}CASE_TABLE,
63+
{case_upper}CASE_TABLE_MULTI,
64+
)
8265
}}",
8366
mappings = fmt_list(&mappings),
8467
mappings_len = mappings.len(),

src/tools/unicode-table-generator/src/skiplist.rs

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,8 @@ impl RawEmitter {
8282
writeln!(self.file,
8383
"use super::ShortOffsetRunHeader;
8484
85-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; {short_offset_runs_len}] = {short_offset_runs:?};
86-
static OFFSETS: [u8; {coded_offset_len}] = {coded_offsets:?};
85+
static SHORT_OFFSET_RUNS: &[ShortOffsetRunHeader; {short_offset_runs_len}] = &{short_offset_runs:?};
86+
static OFFSETS: &[u8; {coded_offset_len}] = &{coded_offsets:?};
8787
8888
#[inline]
8989
pub fn lookup(c: char) -> bool {{
@@ -93,17 +93,7 @@ impl RawEmitter {
9393
9494
#[inline(never)]
9595
fn lookup_slow(c: char) -> bool {{
96-
const {{
97-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
98-
let mut i = 0;
99-
while i < SHORT_OFFSET_RUNS.len() {{
100-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
101-
i += 1;
102-
}}
103-
}}
104-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
105-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
106-
unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }}
96+
super::skip_search!(c, SHORT_OFFSET_RUNS, OFFSETS) // tail expression: returns the macro's `bool`
10797
}}",
10898
short_offset_runs_len = short_offset_runs.len(),
10999
coded_offset_len = coded_offsets.len(),

0 commit comments

Comments
 (0)