Skip to content

Commit

Permalink
Auto merge of rust-lang#90414 - thomcc:count-chars-faster, r=nagisa
Browse files Browse the repository at this point in the history
Optimize `core::str::Chars::count`

I wrote this a while ago after seeing this function as a bottleneck in a profile, but never got around to contributing it. I saw it again, and so here it is. The implementation is fairly complex, but I tried to explain what's happening at both a high level (in the header comment for the file), and in line comments in the impl. Hopefully it's clear enough.

This implementation (`case00_cur_libcore` in the benchmarks below) is somewhat consistently around 4x to 5x faster than the old implementation (`case01_old_libcore` in the benchmarks below), for a wide variety of workloads, without regressing performance on any of the workload sizes I've tried.

I also improved the benchmarks for this code, so that they explicitly check text in different languages and of different sizes (err, the cross product of language x size). The results of the benchmarks are here:

<details>
<summary>Benchmark results</summary>
<pre>
test str::char_count::emoji_huge::case00_cur_libcore       ... bench:      20,216 ns/iter (+/- 3,673) = 17931 MB/s
test str::char_count::emoji_huge::case01_old_libcore       ... bench:     108,851 ns/iter (+/- 12,777) = 3330 MB/s
test str::char_count::emoji_huge::case02_iter_increment    ... bench:     329,502 ns/iter (+/- 4,163) = 1100 MB/s
test str::char_count::emoji_huge::case03_manual_char_len   ... bench:     223,333 ns/iter (+/- 14,167) = 1623 MB/s
test str::char_count::emoji_large::case00_cur_libcore      ... bench:         293 ns/iter (+/- 6) = 19331 MB/s
test str::char_count::emoji_large::case01_old_libcore      ... bench:       1,681 ns/iter (+/- 28) = 3369 MB/s
test str::char_count::emoji_large::case02_iter_increment   ... bench:       5,166 ns/iter (+/- 85) = 1096 MB/s
test str::char_count::emoji_large::case03_manual_char_len  ... bench:       3,476 ns/iter (+/- 62) = 1629 MB/s
test str::char_count::emoji_medium::case00_cur_libcore     ... bench:          48 ns/iter (+/- 0) = 14750 MB/s
test str::char_count::emoji_medium::case01_old_libcore     ... bench:         217 ns/iter (+/- 4) = 3262 MB/s
test str::char_count::emoji_medium::case02_iter_increment  ... bench:         642 ns/iter (+/- 7) = 1102 MB/s
test str::char_count::emoji_medium::case03_manual_char_len ... bench:         445 ns/iter (+/- 3) = 1591 MB/s
test str::char_count::emoji_small::case00_cur_libcore      ... bench:          18 ns/iter (+/- 0) = 3777 MB/s
test str::char_count::emoji_small::case01_old_libcore      ... bench:          23 ns/iter (+/- 0) = 2956 MB/s
test str::char_count::emoji_small::case02_iter_increment   ... bench:          66 ns/iter (+/- 2) = 1030 MB/s
test str::char_count::emoji_small::case03_manual_char_len  ... bench:          29 ns/iter (+/- 1) = 2344 MB/s
test str::char_count::en_huge::case00_cur_libcore          ... bench:      25,909 ns/iter (+/- 39,260) = 13299 MB/s
test str::char_count::en_huge::case01_old_libcore          ... bench:     102,887 ns/iter (+/- 3,257) = 3349 MB/s
test str::char_count::en_huge::case02_iter_increment       ... bench:     166,370 ns/iter (+/- 12,439) = 2071 MB/s
test str::char_count::en_huge::case03_manual_char_len      ... bench:     166,332 ns/iter (+/- 4,262) = 2071 MB/s
test str::char_count::en_large::case00_cur_libcore         ... bench:         281 ns/iter (+/- 6) = 19160 MB/s
test str::char_count::en_large::case01_old_libcore         ... bench:       1,598 ns/iter (+/- 19) = 3369 MB/s
test str::char_count::en_large::case02_iter_increment      ... bench:       2,598 ns/iter (+/- 167) = 2072 MB/s
test str::char_count::en_large::case03_manual_char_len     ... bench:       2,578 ns/iter (+/- 55) = 2088 MB/s
test str::char_count::en_medium::case00_cur_libcore        ... bench:          44 ns/iter (+/- 1) = 15295 MB/s
test str::char_count::en_medium::case01_old_libcore        ... bench:         201 ns/iter (+/- 51) = 3348 MB/s
test str::char_count::en_medium::case02_iter_increment     ... bench:         322 ns/iter (+/- 40) = 2090 MB/s
test str::char_count::en_medium::case03_manual_char_len    ... bench:         319 ns/iter (+/- 5) = 2109 MB/s
test str::char_count::en_small::case00_cur_libcore         ... bench:          15 ns/iter (+/- 0) = 2333 MB/s
test str::char_count::en_small::case01_old_libcore         ... bench:          14 ns/iter (+/- 0) = 2500 MB/s
test str::char_count::en_small::case02_iter_increment      ... bench:          30 ns/iter (+/- 1) = 1166 MB/s
test str::char_count::en_small::case03_manual_char_len     ... bench:          30 ns/iter (+/- 1) = 1166 MB/s
test str::char_count::ru_huge::case00_cur_libcore          ... bench:      16,439 ns/iter (+/- 3,105) = 19777 MB/s
test str::char_count::ru_huge::case01_old_libcore          ... bench:      89,480 ns/iter (+/- 2,555) = 3633 MB/s
test str::char_count::ru_huge::case02_iter_increment       ... bench:     217,703 ns/iter (+/- 22,185) = 1493 MB/s
test str::char_count::ru_huge::case03_manual_char_len      ... bench:     157,330 ns/iter (+/- 19,188) = 2066 MB/s
test str::char_count::ru_large::case00_cur_libcore         ... bench:         243 ns/iter (+/- 6) = 20905 MB/s
test str::char_count::ru_large::case01_old_libcore         ... bench:       1,384 ns/iter (+/- 51) = 3670 MB/s
test str::char_count::ru_large::case02_iter_increment      ... bench:       3,381 ns/iter (+/- 543) = 1502 MB/s
test str::char_count::ru_large::case03_manual_char_len     ... bench:       2,423 ns/iter (+/- 429) = 2096 MB/s
test str::char_count::ru_medium::case00_cur_libcore        ... bench:          42 ns/iter (+/- 1) = 15119 MB/s
test str::char_count::ru_medium::case01_old_libcore        ... bench:         180 ns/iter (+/- 4) = 3527 MB/s
test str::char_count::ru_medium::case02_iter_increment     ... bench:         402 ns/iter (+/- 45) = 1579 MB/s
test str::char_count::ru_medium::case03_manual_char_len    ... bench:         280 ns/iter (+/- 29) = 2267 MB/s
test str::char_count::ru_small::case00_cur_libcore         ... bench:          12 ns/iter (+/- 0) = 2666 MB/s
test str::char_count::ru_small::case01_old_libcore         ... bench:          12 ns/iter (+/- 0) = 2666 MB/s
test str::char_count::ru_small::case02_iter_increment      ... bench:          19 ns/iter (+/- 0) = 1684 MB/s
test str::char_count::ru_small::case03_manual_char_len     ... bench:          14 ns/iter (+/- 1) = 2285 MB/s
test str::char_count::zh_huge::case00_cur_libcore          ... bench:      15,053 ns/iter (+/- 2,640) = 20067 MB/s
test str::char_count::zh_huge::case01_old_libcore          ... bench:      82,622 ns/iter (+/- 3,602) = 3656 MB/s
test str::char_count::zh_huge::case02_iter_increment       ... bench:     230,456 ns/iter (+/- 7,246) = 1310 MB/s
test str::char_count::zh_huge::case03_manual_char_len      ... bench:     220,595 ns/iter (+/- 11,624) = 1369 MB/s
test str::char_count::zh_large::case00_cur_libcore         ... bench:         227 ns/iter (+/- 65) = 20792 MB/s
test str::char_count::zh_large::case01_old_libcore         ... bench:       1,136 ns/iter (+/- 144) = 4154 MB/s
test str::char_count::zh_large::case02_iter_increment      ... bench:       3,147 ns/iter (+/- 253) = 1499 MB/s
test str::char_count::zh_large::case03_manual_char_len     ... bench:       2,993 ns/iter (+/- 400) = 1577 MB/s
test str::char_count::zh_medium::case00_cur_libcore        ... bench:          36 ns/iter (+/- 5) = 16388 MB/s
test str::char_count::zh_medium::case01_old_libcore        ... bench:         142 ns/iter (+/- 18) = 4154 MB/s
test str::char_count::zh_medium::case02_iter_increment     ... bench:         379 ns/iter (+/- 37) = 1556 MB/s
test str::char_count::zh_medium::case03_manual_char_len    ... bench:         364 ns/iter (+/- 51) = 1620 MB/s
test str::char_count::zh_small::case00_cur_libcore         ... bench:          11 ns/iter (+/- 1) = 3000 MB/s
test str::char_count::zh_small::case01_old_libcore         ... bench:          11 ns/iter (+/- 1) = 3000 MB/s
test str::char_count::zh_small::case02_iter_increment      ... bench:          20 ns/iter (+/- 3) = 1650 MB/s
</pre>
</details>

I also added fairly thorough tests for different sizes and alignments. This completes on my machine in 0.02s, which is surprising given how thorough they are, but it seems to detect bugs in the implementation. (I haven't run the tests on a 32 bit machine yet since before I reworked the code a little though, so... hopefully I'm not about to embarrass myself).

This uses similar SWAR-style techniques to the `is_ascii` impl I contributed in rust-lang#74066, so I'm going to request review from the same person who reviewed that one. That said am not particularly picky, and might not have the correct syntax for requesting a review from someone (so it goes).

r? `@nagisa`
  • Loading branch information
bors committed Feb 6, 2022
2 parents e069a71 + 41f8214 commit f624427
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 29 deletions.
42 changes: 42 additions & 0 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2230,3 +2230,45 @@ fn utf8_chars() {
assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok()));
assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok()));
}

#[test]
fn utf8_char_counts() {
let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)];
let mut reps =
[8, 64, 256, 512, 1024].iter().copied().flat_map(|n| n - 8..=n + 8).collect::<Vec<usize>>();
if cfg!(not(miri)) {
let big = 1 << 16;
reps.extend(big - 8..=big + 8);
}
let counts = if cfg!(miri) { 0..1 } else { 0..8 };
let padding = counts.map(|len| " ".repeat(len)).collect::<Vec<String>>();

for repeat in reps {
for (tmpl_str, tmpl_char_count) in strs {
for pad_start in &padding {
for pad_end in &padding {
// Create a string with padding...
let with_padding =
format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end);
// ...and then skip past that padding. This should ensure
// that we test several different alignments for both head
// and tail.
let si = pad_start.len();
let ei = with_padding.len() - pad_end.len();
let target = &with_padding[si..ei];

assert!(!target.starts_with(" ") && !target.ends_with(" "));
let expected_count = tmpl_char_count * repeat;
assert_eq!(
expected_count,
target.chars().count(),
"wrong count for `{:?}.repeat({})` (padding: `{:?}`)",
tmpl_str,
repeat,
(pad_start.len(), pad_end.len()),
);
}
}
}
}
}
29 changes: 3 additions & 26 deletions library/core/benches/str.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,10 @@
use std::str;
use test::{black_box, Bencher};

const LOREM_SHORT: &str = "Lorem ipsum";

const LOREM: &str = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.
Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.
Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis.
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua. est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur";

const EMOJI: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶‍🌫️😏😒🙄😬😮‍💨🤥😌😔😪🤤😴😷🤒🤕🤢🤮🤧🥵🥶🥴😵😵‍💫🤯🤠🥳🥸😎🤓🧐😕😟🙁☹😮😯😲😳🥺😦😧😨😰😥😢😭😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿💀☠💩🤡👹👺👻👽👾🤖😺😸😹😻😼😽🙀😿😾🙈🙉🙊💋💌💘💝💖💗💓💞💕💟❣💔❤️‍🔥❤️‍🩹❤🧡💛💚💙💜🤎🖤🤍💯💢💥💫💦💨🕳💣💬👁️‍🗨️🗨🗯💭💤👋🤚🖐✋🖖👌🤌🤏✌🤞🤟🤘🤙👈👉👆🖕👇☝👍👎✊👊🤛🤜👏🙌👐🤲🤝🙏✍💅🤳💪🦾🦿🦵🦶👂🦻👃🧠🫀🫁🦷🦴👀👁👅👄👶🧒👦👧🧑👱👨🧔🧔‍♂️🧔‍♀️👨‍🦰👨‍🦱👨‍🦳👨‍🦲👩👩‍🦰🧑‍🦰👩‍🦱🧑‍🦱👩‍🦳🧑‍🦳👩‍🦲🧑‍🦲👱‍♀️👱‍♂️🧓👴👵🙍🙍‍♂️🙍‍♀️🙎🙎‍♂️🙎‍♀️🙅🙅‍♂️🙅‍♀️🙆🙆‍♂️🙆‍♀️💁💁‍♂️💁‍♀️🙋🙋‍♂️🙋‍♀️🧏🧏‍♂️🧏‍♀️🙇🙇‍♂️🙇‍♀️🤦🤦‍♂️🤦‍♀️🤷🤷‍♂️🤷‍♀️🧑‍⚕️👨‍⚕️👩‍⚕️🧑‍🎓👨‍🎓👩‍🎓🧑‍🏫👨‍🏫👩‍🏫🧑‍⚖️👨‍⚖️👩‍⚖️🧑‍🌾👨‍🌾👩‍🌾🧑‍🍳👨‍🍳👩‍🍳🧑‍🔧👨‍🔧👩‍🔧🧑‍🏭👨‍🏭👩‍🏭🧑‍💼👨‍💼👩‍💼🧑‍🔬👨‍🔬👩‍🔬🧑‍💻👨‍💻👩‍💻🧑‍🎤👨‍🎤👩‍🎤🧑‍🎨👨‍🎨👩‍🎨🧑‍✈️👨‍✈️👩‍✈️🧑‍🚀👨‍🚀👩‍🚀🧑‍🚒👨‍🚒👩‍🚒👮👮‍♂️👮‍♀️🕵🕵️‍♂️🕵️‍♀️💂💂‍♂️💂‍♀️🥷👷👷‍♂️👷‍♀️🤴👸👳👳‍♂️👳‍♀️👲🧕🤵🤵‍♂️🤵‍♀️👰👰‍♂️👰‍♀️🤰🤱👩‍🍼👨‍🍼🧑‍🍼👼🎅🤶🧑‍🎄🦸🦸‍♂️🦸‍♀️🦹🦹‍♂️🦹‍♀️🧙🧙‍♂️🧙‍♀️🧚🧚‍♂️🧚‍♀️🧛🧛‍♂️🧛‍♀️🧜🧜‍♂️🧜‍♀️🧝🧝‍♂️🧝‍♀️🧞🧞‍♂️🧞‍♀️🧟🧟‍♂️🧟‍♀️💆💆‍♂️💆‍♀️💇💇‍♂️💇‍♀️🚶🚶‍♂️🚶‍♀️🧍🧍‍♂️🧍‍♀️🧎🧎‍♂️🧎‍♀️🧑‍🦯👨‍🦯👩‍🦯🧑‍🦼👨‍🦼👩‍🦼🧑‍🦽👨‍🦽👩‍🦽🏃🏃‍♂️🏃‍♀️💃🕺🕴👯👯‍♂️👯‍♀️🧖🧖‍♂️🧖‍♀️🧗🧗‍♂️🧗‍♀️🤺🏇⛷🏂🏌🏌️‍♂️🏌️‍♀️🏄🏄‍♂️🏄‍♀️🚣🚣‍♂️🚣‍♀️🏊🏊‍♂️🏊‍♀️⛹⛹️‍♂️⛹️‍♀️🏋🏋️‍♂️🏋️‍♀️🚴🚴‍♂️🚴‍♀️🚵🚵‍♂️🚵‍♀️🤸🤸‍♂️🤸‍♀️🤼🤼‍♂️🤼‍♀️🤽🤽‍♂️🤽‍♀️🤾🤾‍♂️🤾‍♀️🤹🤹‍♂️🤹‍♀️🧘🧘‍♂️🧘‍♀️🛀🛌🧑‍🤝‍🧑👭👫👬💏👩‍❤️‍💋‍👨👨‍❤️‍💋‍👨👩‍❤️‍💋‍👩💑👩‍❤️‍👨👨‍❤️‍👨👩‍❤️‍👩👪👨‍👩‍👦👨‍👩‍👧👨‍👩‍👧‍👦👨‍👩‍👦‍👦👨‍👩‍👧‍👧👨‍👨‍👦👨‍👨‍👧👨‍👨‍👧‍👦👨‍👨‍👦‍👦👨‍👨‍👧‍👧👩‍👩‍👦👩‍👩‍👧👩‍👩‍👧‍👦👩‍👩‍👦‍👦👩‍👩‍👧‍👧👨‍👦👨‍👦‍👦👨‍👧👨‍👧‍👦👨‍👧‍👧👩‍👦👩‍👦‍👦👩‍👧👩‍👧‍👦👩‍👧‍👧🗣👤👥🫂👣🦰🦱🦳🦲🐵🐒🦍🦧🐶🐕🦮🐕‍🦺🐩🐺🦊🦝🐱🐈🐈‍⬛🦁🐯🐅🐆🐴🐎🦄🦓🦌🦬🐮🐂🐃🐄🐷🐖🐗🐽🐏🐑🐐🐪🐫🦙🦒🐘🦣🦏🦛🐭🐁🐀🐹🐰🐇🐿🦫🦔🦇🐻🐻‍❄️🐨🐼🦥🦦🦨🦘🦡🐾🦃🐔🐓🐣🐤🐥🐦🐧🕊🦅🦆🦢🦉🦤🪶🦩🦚🦜🐸🐊🐢🦎🐍🐲🐉🦕🦖🐳🐋🐬🦭🐟🐠🐡🦈🐙🐚🐌🦋🐛🐜🐝🪲🐞🦗🪳🕷🕸🦂🦟🪰🪱🦠💐🌸💮🏵🌹🥀🌺🌻🌼🌷🌱🪴🌲🌳🌴🌵🌾🌿☘🍀🍁🍂🍃🍇🍈🍉🍊🍋🍌🍍🥭🍎🍏🍐🍑🍒🍓🫐🥝🍅🫒🥥🥑🍆🥔🥕🌽🌶🫑🥒🥬🥦🧄🧅🍄🥜🌰🍞🥐🥖🫓🥨🥯🥞🧇🧀🍖🍗🥩🥓🍔🍟🍕🌭🥪🌮🌯🫔🥙🧆🥚🍳🥘🍲🫕🥣🥗🍿🧈🧂🥫🍱🍘🍙🍚🍛🍜🍝🍠🍢🍣🍤🍥🥮🍡🥟🥠🥡🦀🦞🦐🦑🦪🍦🍧🍨🍩🍪🎂🍰🧁🥧🍫🍬🍭🍮🍯🍼🥛☕🫖🍵🍶🍾🍷🍸🍹🍺🍻🥂🥃🥤🧋🧃🧉🧊🥢🍽🍴🥄🔪🏺🌍🌎🌏🌐🗺🗾🧭🏔⛰🌋🗻🏕🏖🏜🏝🏞🏟🏛🏗🧱🪨🪵🛖🏘🏚🏠🏡🏢🏣🏤🏥🏦🏨🏩🏪🏫🏬🏭🏯🏰💒🗼🗽⛪🕌🛕🕍⛩🕋⛲⛺🌁🌃🏙🌄🌅🌆🌇🌉♨🎠🎡🎢💈🎪🚂🚃🚄🚅🚆🚇🚈🚉🚊🚝🚞🚋🚌🚍🚎🚐🚑🚒🚓🚔🚕🚖🚗🚘🚙🛻🚚🚛🚜🏎🏍🛵🦽🦼🛺🚲🛴🛹🛼🚏🛣🛤🛢⛽🚨🚥🚦🛑🚧⚓⛵🛶🚤🛳⛴🛥🚢✈🛩🛫🛬🪂💺🚁🚟🚠🚡🛰🚀🛸🛎🧳⌛⏳⌚⏰⏱⏲🕰🕛🕧🕐🕜🕑🕝🕒🕞🕓🕟🕔🕠🕕🕡🕖🕢🕗🕣🕘🕤🕙🕥🕚🕦🌑🌒🌓🌔🌕🌖🌗🌘🌙🌚🌛🌜🌡☀🌝🌞🪐⭐🌟🌠🌌☁⛅⛈🌤🌥🌦🌧🌨🌩🌪🌫🌬🌀🌈🌂☂☔⛱⚡❄☃⛄☄🔥💧🌊🎃🎄🎆🎇🧨✨🎈🎉🎊🎋🎍🎎🎏🎐🎑🧧🎀🎁🎗🎟🎫🎖🏆🏅🥇🥈🥉⚽⚾🥎🏀🏐🏈🏉🎾🥏🎳🏏🏑🏒🥍🏓🏸🥊🥋🥅⛳⛸🎣🤿🎽🎿🛷🥌🎯🪀🪁🎱🔮🪄🧿🎮🕹🎰🎲🧩🧸🪅🪆♠♥♦♣♟🃏🀄🎴🎭🖼🎨🧵🪡🧶🪢👓🕶🥽🥼🦺👔👕👖🧣🧤🧥🧦👗👘🥻🩱🩲🩳👙👚👛👜👝🛍🎒🩴👞👟🥾🥿👠👡🩰👢👑👒🎩🎓🧢🪖⛑📿💄💍💎🔇🔈🔉🔊📢📣📯🔔🔕🎼🎵🎶🎙🎚🎛🎤🎧📻🎷🪗🎸🎹🎺🎻🪕🥁";

#[bench]
fn str_char_count_lorem(b: &mut Bencher) {
b.iter(|| black_box(LOREM).chars().count());
}

#[bench]
fn str_char_count_lorem_short(b: &mut Bencher) {
b.iter(|| black_box(LOREM_SHORT).chars().count());
}

#[bench]
fn str_char_count_emoji(b: &mut Bencher) {
b.iter(|| black_box(EMOJI).chars().count());
}
mod char_count;
mod corpora;

#[bench]
fn str_validate_emoji(b: &mut Bencher) {
b.iter(|| str::from_utf8(black_box(EMOJI.as_bytes())));
b.iter(|| str::from_utf8(black_box(corpora::emoji::LARGE.as_bytes())));
}
107 changes: 107 additions & 0 deletions library/core/benches/str/char_count.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
use super::corpora::*;
use test::{black_box, Bencher};

macro_rules! define_benches {
($( fn $name: ident($arg: ident: &str) $body: block )+) => {
define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);

define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+);
define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);

define_benches!(mod ru_tiny, ru::TINY, $($name $arg $body)+);
define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+);
define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+);
define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+);
define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+);

define_benches!(mod emoji_tiny, emoji::TINY, $($name $arg $body)+);
define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+);
define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+);
define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+);
define_benches!(mod emoji_huge, emoji::HUGE, $($name $arg $body)+);
};
(mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => {
mod $mod_name {
use super::*;
$(
#[bench]
fn $name(bencher: &mut Bencher) {
let input = $input;
bencher.bytes = input.len() as u64;
let mut input_s = input.to_string();
bencher.iter(|| {
let $arg: &str = &black_box(&mut input_s);
black_box($body)
})
}
)+
}
};
}

define_benches! {
fn case00_libcore(s: &str) {
libcore(s)
}

fn case01_filter_count_cont_bytes(s: &str) {
filter_count_cont_bytes(s)
}

fn case02_iter_increment(s: &str) {
iterator_increment(s)
}

fn case03_manual_char_len(s: &str) {
manual_char_len(s)
}
}

fn libcore(s: &str) -> usize {
s.chars().count()
}

#[inline]
fn utf8_is_cont_byte(byte: u8) -> bool {
(byte as i8) < -64
}

fn filter_count_cont_bytes(s: &str) -> usize {
s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count()
}

fn iterator_increment(s: &str) -> usize {
let mut c = 0;
for _ in s.chars() {
c += 1;
}
c
}

fn manual_char_len(s: &str) -> usize {
let s = s.as_bytes();
let mut c = 0;
let mut i = 0;
let l = s.len();
while i < l {
let b = s[i];
if b < 0x80 {
i += 1;
} else if b < 0xe0 {
i += 2;
} else if b < 0xf0 {
i += 3;
} else {
i += 4;
}
c += 1;
}
c
}
88 changes: 88 additions & 0 deletions library/core/benches/str/corpora.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//! Exposes a number of modules with different kinds of strings.
//!
//! Each module contains `&str` constants named `TINY`, `SMALL`, `MEDIUM`,
//! `LARGE`, and `HUGE`.
//!
//! - The `TINY` string is generally around 8 bytes.
//! - The `SMALL` string is generally around 30-40 bytes.
//! - The `MEDIUM` string is generally around 600-700 bytes.
//! - The `LARGE` string is the `MEDIUM` string repeated 8x, and is around 5kb.
//! - The `HUGE` string is the `LARGE` string repeated 8x (or the `MEDIUM`
//! string repeated 64x), and is around 40kb.
//!
//! Except for `mod emoji` (which is just a bunch of emoji), the strings were
//! pulled from (localizations of) rust-lang.org.

macro_rules! repeat8 {
($s:expr) => {
concat!($s, $s, $s, $s, $s, $s, $s, $s)
};
}

macro_rules! define_consts {
($s:literal) => {
pub const MEDIUM: &str = $s;
pub const LARGE: &str = repeat8!($s);
pub const HUGE: &str = repeat8!(repeat8!(repeat8!($s)));
};
}

pub mod en {
pub const TINY: &str = "Mary had";
pub const SMALL: &str = "Mary had a little lamb, Little lamb";
define_consts! {
"Rust is blazingly fast and memory-efficient: with no runtime or garbage
collector, it can power performance-critical services, run on embedded
devices, and easily integrate with other languages. Rust’s rich type system
and ownership model guarantee memory-safety and thread-safety — enabling you
to eliminate many classes of bugs at compile-time. Rust has great
documentation, a friendly compiler with useful error messages, and top-notch
tooling — an integrated package manager and build tool, smart multi-editor
support with auto-completion and type inspections, an auto-formatter, and
more."
}
}

pub mod zh {
pub const TINY: &str = "速度惊";
pub const SMALL: &str = "速度惊人且内存利用率极高";
define_consts! {
"Rust 速度惊人且内存利用率极高。由于\
没有运行时和垃圾回收,它能够胜任对性能要\
求特别高的服务,可以在嵌入式设备上运行,\
还能轻松和其他语言集成。Rust 丰富的类型\
系统和所有权模型保证了内存安全和线程安全,\
让您在编译期就能够消除各种各样的错误。\
Rust 拥有出色的文档、友好的编译器和清晰\
的错误提示信息, 还集成了一流的工具——\
包管理器和构建工具, 智能地自动补全和类\
型检验的多编辑器支持, 以及自动格式化代\
码等等。"
}
}

pub mod ru {
pub const TINY: &str = "Сотни";
pub const SMALL: &str = "Сотни компаний по";
define_consts! {
"Сотни компаний по всему миру используют Rust в реальных\
проектах для быстрых кросс-платформенных решений с\
ограниченными ресурсами. Такие проекты, как Firefox,\
Dropbox и Cloudflare, используют Rust. Rust отлично\
подходит как для стартапов, так и для больших компаний,\
как для встраиваемых устройств, так и для масштабируемых\
web-сервисов. Мой самый большой комплимент Rust."
}
}

pub mod emoji {
pub const TINY: &str = "😀😃";
pub const SMALL: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘";
define_consts! {
"😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶‍🌫️😏😒\
🙄😬😮‍💨🤥😌😔😪🤤😴😷🤒🤕🤢🤮🤧🥵🥶🥴😵😵‍💫🤯��🥳🥸😎🤓🧐😕😟🙁☹😮😯😲😳🥺😦😧😨\
😰😥😢😭😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿💀☠💩🤡👹👺👻👽👾🤖😺😸😹😻😼😽🙀😿😾🙈🙉🙊\
💋💌💘💝💖💗💓��💕💟❣💔❤️‍🔥❤️‍🩹❤🧡💛💚💙💜🤎🖤🤍💯💢💥💫💦💨🕳💬👁️‍🗨️🗨🗯💭💤👋\
🤚🖐✋🖖👌🤌🤏✌"
}
}
Loading

0 comments on commit f624427

Please sign in to comment.