From 628b2173265e19cb5d22b86349618b0ab88cf5c2 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Sat, 30 Oct 2021 03:47:47 -0700 Subject: [PATCH 1/5] Optimize `core::str::Chars::count` --- library/alloc/tests/str.rs | 40 +++++++++ library/core/benches/str.rs | 29 +------ library/core/benches/str/char_count.rs | 101 +++++++++++++++++++++ library/core/benches/str/corpora.rs | 83 ++++++++++++++++++ library/core/src/str/count.rs | 116 +++++++++++++++++++++++++ library/core/src/str/iter.rs | 5 +- library/core/src/str/mod.rs | 1 + 7 files changed, 346 insertions(+), 29 deletions(-) create mode 100644 library/core/benches/str/char_count.rs create mode 100644 library/core/benches/str/corpora.rs create mode 100644 library/core/src/str/count.rs diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index e92881b104928..3dcbc54be4e8d 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -2230,3 +2230,43 @@ fn utf8_chars() { assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok())); assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok())); } + +#[test] +fn utf8_char_counts() { + let strs = [("e", 1), ("รฉ", 1), ("โ‚ฌ", 1), ("\u{10000}", 1), ("eรฉโ‚ฌ\u{10000}", 4)]; + let mut reps = vec![1, 8, 64, 256, 512, 1024]; + if cfg!(not(miri)) { + reps.push(1 << 16); + } + let counts = if cfg!(miri) { 0..1 } else { 0..8 }; + let padding = counts.map(|len| " ".repeat(len)).collect::>(); + + for repeat in reps { + for (tmpl_str, tmpl_char_count) in strs { + for pad_start in &padding { + for pad_end in &padding { + // Create a string with padding... + let with_padding = + format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end); + // ...and then skip past that padding. This should ensure + // that we test several different alignments for both head + // and tail. 
+ let si = pad_start.len(); + let ei = with_padding.len() - pad_end.len(); + let target = &with_padding[si..ei]; + + assert!(!target.starts_with(" ") && !target.ends_with(" ")); + let expected_count = tmpl_char_count * repeat; + assert_eq!( + expected_count, + target.chars().count(), + "wrong count for `{:?}.repeat({})` (padding: `{:?}`)", + tmpl_str, + repeat, + (pad_start.len(), pad_end.len()), + ); + } + } + } + } +} diff --git a/library/core/benches/str.rs b/library/core/benches/str.rs index 1527aa0bd6640..78865d81fb90c 100644 --- a/library/core/benches/str.rs +++ b/library/core/benches/str.rs @@ -1,33 +1,10 @@ use std::str; use test::{black_box, Bencher}; -const LOREM_SHORT: &str = "Lorem ipsum"; - -const LOREM: &str = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. -Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. 
Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. -Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. -Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. -Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis. -At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua. est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur"; - -const EMOJI: &str = "๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿ˜†๐Ÿ˜…๐Ÿคฃ๐Ÿ˜‚๐Ÿ™‚๐Ÿ™ƒ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‡๐Ÿฅฐ๐Ÿ˜๐Ÿคฉ๐Ÿ˜˜๐Ÿ˜—โ˜บ๐Ÿ˜š๐Ÿ˜™๐Ÿฅฒ๐Ÿ˜‹๐Ÿ˜›๐Ÿ˜œ๐Ÿคช๐Ÿ˜๐Ÿค‘๐Ÿค—๐Ÿคญ๐Ÿคซ๐Ÿค”๐Ÿค๐Ÿคจ๐Ÿ˜๐Ÿ˜‘๐Ÿ˜ถ๐Ÿ˜ถโ€๐ŸŒซ๏ธ๐Ÿ˜๐Ÿ˜’๐Ÿ™„๐Ÿ˜ฌ๐Ÿ˜ฎโ€๐Ÿ’จ๐Ÿคฅ๐Ÿ˜Œ๐Ÿ˜”๐Ÿ˜ช๐Ÿคค๐Ÿ˜ด๐Ÿ˜ท๐Ÿค’๐Ÿค•๐Ÿคข๐Ÿคฎ๐Ÿคง๐Ÿฅต๐Ÿฅถ๐Ÿฅด๐Ÿ˜ต๐Ÿ˜ตโ€๐Ÿ’ซ๐Ÿคฏ๐Ÿค ๐Ÿฅณ๐Ÿฅธ๐Ÿ˜Ž๐Ÿค“๐Ÿง๐Ÿ˜•๐Ÿ˜Ÿ๐Ÿ™โ˜น๐Ÿ˜ฎ๐Ÿ˜ฏ๐Ÿ˜ฒ๐Ÿ˜ณ๐Ÿฅบ๐Ÿ˜ฆ๐Ÿ˜ง๐Ÿ˜จ๐Ÿ˜ฐ๐Ÿ˜ฅ๐Ÿ˜ข๐Ÿ˜ญ๐Ÿ˜ฑ๐Ÿ˜–๐Ÿ˜ฃ๐Ÿ˜ž๐Ÿ˜“๐Ÿ˜ฉ๐Ÿ˜ซ๐Ÿฅฑ๐Ÿ˜ค๐Ÿ˜ก๐Ÿ˜ ๐Ÿคฌ๐Ÿ˜ˆ๐Ÿ‘ฟ๐Ÿ’€โ˜ ๐Ÿ’ฉ๐Ÿคก๐Ÿ‘น๐Ÿ‘บ๐Ÿ‘ป๐Ÿ‘ฝ๐Ÿ‘พ๐Ÿค–๐Ÿ˜บ๐Ÿ˜ธ๐Ÿ˜น๐Ÿ˜ป๐Ÿ˜ผ๐Ÿ˜ฝ๐Ÿ™€๐Ÿ˜ฟ๐Ÿ˜พ๐Ÿ™ˆ๐Ÿ™‰๐Ÿ™Š๐Ÿ’‹๐Ÿ’Œ๐Ÿ’˜๐Ÿ’๐Ÿ’–๐Ÿ’—๐Ÿ’“๐Ÿ’ž๐Ÿ’•๐Ÿ’Ÿโฃ๐Ÿ’”โค๏ธโ€๐Ÿ”ฅโค๏ธโ€๐Ÿฉนโค๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ๐ŸคŽ๐Ÿ–ค๐Ÿค๐Ÿ’ฏ๐Ÿ’ข๐Ÿ’ฅ๐Ÿ’ซ๐Ÿ’ฆ๐Ÿ’จ๐Ÿ•ณ๐Ÿ’ฃ๐Ÿ’ฌ๐Ÿ‘๏ธโ€๐Ÿ—จ๏ธ๐Ÿ—จ๐Ÿ—ฏ๐Ÿ’ญ๐Ÿ’ค๐Ÿ‘‹๐Ÿคš๐Ÿ–โœ‹๐Ÿ––๐Ÿ‘Œ๐ŸคŒ๐ŸคโœŒ๐Ÿคž๐ŸคŸ๐Ÿค˜๐Ÿค™๐Ÿ‘ˆ๐Ÿ‘‰๐Ÿ‘†๐Ÿ–•๐Ÿ‘‡โ˜๐Ÿ‘๐Ÿ‘ŽโœŠ๐Ÿ‘Š๐Ÿค›๐Ÿคœ๐Ÿ‘๐Ÿ™Œ๐Ÿ‘๐Ÿคฒ๐Ÿค๐Ÿ™โœ๐Ÿ’…๐Ÿคณ๐Ÿ’ช๐Ÿฆพ๐Ÿฆฟ๐Ÿฆต๐Ÿฆถ๐Ÿ‘‚๐Ÿฆป๐Ÿ‘ƒ๐Ÿง ๐Ÿซ€๐Ÿซ๐Ÿฆท๐Ÿฆด๐Ÿ‘€๐Ÿ‘๐Ÿ‘…๐Ÿ‘„๐Ÿ‘ถ๐Ÿง’๐Ÿ‘ฆ๐Ÿ‘ง๐Ÿง‘๐Ÿ‘ฑ๐Ÿ‘จ๐Ÿง”๐Ÿง”โ€โ™‚๏ธ๐Ÿง”โ€โ™€๏ธ๐Ÿ‘จโ€๐Ÿฆฐ๐Ÿ‘จโ€๐Ÿฆฑ๐Ÿ‘จโ€๐Ÿฆณ๐Ÿ‘จโ€๐Ÿฆฒ๐Ÿ‘ฉ๐Ÿ‘ฉโ€๐Ÿฆฐ๐Ÿง‘โ€๐Ÿฆฐ๐Ÿ‘ฉโ€๐Ÿฆฑ๐Ÿง‘โ€๐Ÿฆฑ๐Ÿ‘ฉโ€๐Ÿฆณ๐Ÿง‘โ€๐Ÿฆณ๐Ÿ‘ฉโ€๐Ÿฆฒ๐Ÿง‘โ€๐Ÿฆฒ๐Ÿ‘ฑโ€โ™€๏ธ๐Ÿ‘ฑโ€โ™‚๏ธ๐Ÿง“๐Ÿ‘ด๐Ÿ‘ต๐Ÿ™๐Ÿ™โ€โ™‚๏ธ๐Ÿ™โ€โ™€๏ธ๐Ÿ™Ž๐Ÿ™Žโ€โ™‚๏ธ๐Ÿ™Žโ€โ™€๏ธ๐Ÿ™…๐Ÿ™…โ€โ™‚๏ธ๐Ÿ™…โ€โ™€๏ธ๐Ÿ™†๐Ÿ™†โ€โ™‚๏ธ๐Ÿ™†โ€โ™€๏ธ๐Ÿ’๐Ÿ’โ€โ™‚๏ธ๐Ÿ’โ€โ™€๏ธ๐Ÿ™‹๐Ÿ™‹โ€โ™‚๏ธ๐Ÿ™‹โ€โ™€๏ธ๐Ÿง๐Ÿงโ€โ™‚๏ธ๐Ÿงโ€โ™€๏ธ๐Ÿ™‡๐Ÿ™‡โ€โ™‚๏ธ๐Ÿ™‡โ€โ™€๏ธ๐Ÿคฆ๐Ÿคฆโ€โ™‚๏ธ๐Ÿคฆโ€โ™€๏ธ๐Ÿคท๐Ÿคทโ€โ™‚๏ธ๐Ÿคทโ€โ™€๏ธ๐Ÿง‘โ€โš•๏ธ๐Ÿ‘จโ€โš•๏ธ๐Ÿ‘ฉโ€โš•๏ธ๐Ÿง‘โ€๐ŸŽ“๐Ÿ‘จโ€๐ŸŽ“๐Ÿ‘ฉโ€๐ŸŽ“๐Ÿง‘โ€๐Ÿซ๐Ÿ‘จโ€๐Ÿซ๐Ÿ‘ฉโ€๐Ÿซ๐Ÿง‘โ€โš–๏ธ๐Ÿ‘จโ€โš–๏ธ๐Ÿ‘ฉโ€โš–๏ธ๐Ÿง‘โ€๐ŸŒพ๐Ÿ‘จโ€๐ŸŒพ๐Ÿ‘ฉโ€๐ŸŒพ๐Ÿง‘โ€๐Ÿณ๐Ÿ‘จโ€๐Ÿณ๐Ÿ‘ฉโ€๐Ÿณ๐Ÿง‘โ€๐Ÿ”ง๐Ÿ‘จโ€๐Ÿ”ง๐Ÿ‘ฉโ€๐Ÿ”ง๐Ÿง‘โ€๐Ÿญ๐Ÿ‘จโ€๐Ÿญ๐Ÿ‘ฉโ€๐Ÿญ๐Ÿง‘โ€๐Ÿ’ผ๐Ÿ‘จโ€๐Ÿ’ผ๐Ÿ‘ฉโ€๐Ÿ’ผ๐Ÿง‘โ€๐Ÿ”ฌ๐Ÿ‘จโ€๐Ÿ”ฌ๐Ÿ‘ฉโ€๐Ÿ”ฌ๐Ÿง‘โ€๐Ÿ’ป๐Ÿ‘จโ€๐Ÿ’ป๐Ÿ‘ฉโ€๐Ÿ’ป๐Ÿง‘โ€๐ŸŽค๐Ÿ‘จโ€๐ŸŽค๐Ÿ‘ฉโ€๐ŸŽค๐Ÿง‘โ€๐ŸŽจ๐Ÿ‘จโ€๐ŸŽจ๐Ÿ‘ฉโ€๐ŸŽจ๐Ÿง‘โ€โœˆ๏ธ๐Ÿ‘จโ€โœˆ๏ธ๐Ÿ‘ฉโ€โœˆ๏ธ๐Ÿง‘โ€๐Ÿš€๐Ÿ‘จโ€๐Ÿš€๐Ÿ‘ฉโ€๐Ÿš€๐Ÿง‘โ€๐Ÿš’๐Ÿ‘จโ€
๐Ÿš’๐Ÿ‘ฉโ€๐Ÿš’๐Ÿ‘ฎ๐Ÿ‘ฎโ€โ™‚๏ธ๐Ÿ‘ฎโ€โ™€๏ธ๐Ÿ•ต๐Ÿ•ต๏ธโ€โ™‚๏ธ๐Ÿ•ต๏ธโ€โ™€๏ธ๐Ÿ’‚๐Ÿ’‚โ€โ™‚๏ธ๐Ÿ’‚โ€โ™€๏ธ๐Ÿฅท๐Ÿ‘ท๐Ÿ‘ทโ€โ™‚๏ธ๐Ÿ‘ทโ€โ™€๏ธ๐Ÿคด๐Ÿ‘ธ๐Ÿ‘ณ๐Ÿ‘ณโ€โ™‚๏ธ๐Ÿ‘ณโ€โ™€๏ธ๐Ÿ‘ฒ๐Ÿง•๐Ÿคต๐Ÿคตโ€โ™‚๏ธ๐Ÿคตโ€โ™€๏ธ๐Ÿ‘ฐ๐Ÿ‘ฐโ€โ™‚๏ธ๐Ÿ‘ฐโ€โ™€๏ธ๐Ÿคฐ๐Ÿคฑ๐Ÿ‘ฉโ€๐Ÿผ๐Ÿ‘จโ€๐Ÿผ๐Ÿง‘โ€๐Ÿผ๐Ÿ‘ผ๐ŸŽ…๐Ÿคถ๐Ÿง‘โ€๐ŸŽ„๐Ÿฆธ๐Ÿฆธโ€โ™‚๏ธ๐Ÿฆธโ€โ™€๏ธ๐Ÿฆน๐Ÿฆนโ€โ™‚๏ธ๐Ÿฆนโ€โ™€๏ธ๐Ÿง™๐Ÿง™โ€โ™‚๏ธ๐Ÿง™โ€โ™€๏ธ๐Ÿงš๐Ÿงšโ€โ™‚๏ธ๐Ÿงšโ€โ™€๏ธ๐Ÿง›๐Ÿง›โ€โ™‚๏ธ๐Ÿง›โ€โ™€๏ธ๐Ÿงœ๐Ÿงœโ€โ™‚๏ธ๐Ÿงœโ€โ™€๏ธ๐Ÿง๐Ÿงโ€โ™‚๏ธ๐Ÿงโ€โ™€๏ธ๐Ÿงž๐Ÿงžโ€โ™‚๏ธ๐Ÿงžโ€โ™€๏ธ๐ŸงŸ๐ŸงŸโ€โ™‚๏ธ๐ŸงŸโ€โ™€๏ธ๐Ÿ’†๐Ÿ’†โ€โ™‚๏ธ๐Ÿ’†โ€โ™€๏ธ๐Ÿ’‡๐Ÿ’‡โ€โ™‚๏ธ๐Ÿ’‡โ€โ™€๏ธ๐Ÿšถ๐Ÿšถโ€โ™‚๏ธ๐Ÿšถโ€โ™€๏ธ๐Ÿง๐Ÿงโ€โ™‚๏ธ๐Ÿงโ€โ™€๏ธ๐ŸงŽ๐ŸงŽโ€โ™‚๏ธ๐ŸงŽโ€โ™€๏ธ๐Ÿง‘โ€๐Ÿฆฏ๐Ÿ‘จโ€๐Ÿฆฏ๐Ÿ‘ฉโ€๐Ÿฆฏ๐Ÿง‘โ€๐Ÿฆผ๐Ÿ‘จโ€๐Ÿฆผ๐Ÿ‘ฉโ€๐Ÿฆผ๐Ÿง‘โ€๐Ÿฆฝ๐Ÿ‘จโ€๐Ÿฆฝ๐Ÿ‘ฉโ€๐Ÿฆฝ๐Ÿƒ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™€๏ธ๐Ÿ’ƒ๐Ÿ•บ๐Ÿ•ด๐Ÿ‘ฏ๐Ÿ‘ฏโ€โ™‚๏ธ๐Ÿ‘ฏโ€โ™€๏ธ๐Ÿง–๐Ÿง–โ€โ™‚๏ธ๐Ÿง–โ€โ™€๏ธ๐Ÿง—๐Ÿง—โ€โ™‚๏ธ๐Ÿง—โ€โ™€๏ธ๐Ÿคบ๐Ÿ‡โ›ท๐Ÿ‚๐ŸŒ๐ŸŒ๏ธโ€โ™‚๏ธ๐ŸŒ๏ธโ€โ™€๏ธ๐Ÿ„๐Ÿ„โ€โ™‚๏ธ๐Ÿ„โ€โ™€๏ธ๐Ÿšฃ๐Ÿšฃโ€โ™‚๏ธ๐Ÿšฃโ€โ™€๏ธ๐ŸŠ๐ŸŠโ€โ™‚๏ธ๐ŸŠโ€โ™€๏ธโ›นโ›น๏ธโ€โ™‚๏ธโ›น๏ธโ€โ™€๏ธ๐Ÿ‹๐Ÿ‹๏ธโ€โ™‚๏ธ๐Ÿ‹๏ธโ€โ™€๏ธ๐Ÿšด๐Ÿšดโ€โ™‚๏ธ๐Ÿšดโ€โ™€๏ธ๐Ÿšต๐Ÿšตโ€โ™‚๏ธ๐Ÿšตโ€โ™€๏ธ๐Ÿคธ๐Ÿคธโ€โ™‚๏ธ๐Ÿคธโ€โ™€๏ธ๐Ÿคผ๐Ÿคผโ€โ™‚๏ธ๐Ÿคผโ€โ™€๏ธ๐Ÿคฝ๐Ÿคฝโ€โ™‚๏ธ๐Ÿคฝโ€โ™€๏ธ๐Ÿคพ๐Ÿคพโ€โ™‚๏ธ๐Ÿคพโ€โ™€๏ธ๐Ÿคน๐Ÿคนโ€โ™‚๏ธ๐Ÿคนโ€โ™€๏ธ๐Ÿง˜๐Ÿง˜โ€โ™‚๏ธ๐Ÿง˜โ€โ™€๏ธ๐Ÿ›€๐Ÿ›Œ๐Ÿง‘โ€๐Ÿคโ€๐Ÿง‘๐Ÿ‘ญ๐Ÿ‘ซ๐Ÿ‘ฌ๐Ÿ’๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘จ๐Ÿ‘จโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘จ๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘ฉ๐Ÿ’‘๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ‘จ๐Ÿ‘จโ€โค๏ธโ€๐Ÿ‘จ๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ‘ฉ๐Ÿ‘ช๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘ง๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘ง๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ง๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ง๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง๐Ÿ‘จโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ๐Ÿ‘จโ€๐Ÿ‘ง๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ๐
Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ง๐Ÿ‘ฉโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘ง๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง๐Ÿ—ฃ๐Ÿ‘ค๐Ÿ‘ฅ๐Ÿซ‚๐Ÿ‘ฃ๐Ÿฆฐ๐Ÿฆฑ๐Ÿฆณ๐Ÿฆฒ๐Ÿต๐Ÿ’๐Ÿฆ๐Ÿฆง๐Ÿถ๐Ÿ•๐Ÿฆฎ๐Ÿ•โ€๐Ÿฆบ๐Ÿฉ๐Ÿบ๐ŸฆŠ๐Ÿฆ๐Ÿฑ๐Ÿˆ๐Ÿˆโ€โฌ›๐Ÿฆ๐Ÿฏ๐Ÿ…๐Ÿ†๐Ÿด๐ŸŽ๐Ÿฆ„๐Ÿฆ“๐ŸฆŒ๐Ÿฆฌ๐Ÿฎ๐Ÿ‚๐Ÿƒ๐Ÿ„๐Ÿท๐Ÿ–๐Ÿ—๐Ÿฝ๐Ÿ๐Ÿ‘๐Ÿ๐Ÿช๐Ÿซ๐Ÿฆ™๐Ÿฆ’๐Ÿ˜๐Ÿฆฃ๐Ÿฆ๐Ÿฆ›๐Ÿญ๐Ÿ๐Ÿ€๐Ÿน๐Ÿฐ๐Ÿ‡๐Ÿฟ๐Ÿฆซ๐Ÿฆ”๐Ÿฆ‡๐Ÿป๐Ÿปโ€โ„๏ธ๐Ÿจ๐Ÿผ๐Ÿฆฅ๐Ÿฆฆ๐Ÿฆจ๐Ÿฆ˜๐Ÿฆก๐Ÿพ๐Ÿฆƒ๐Ÿ”๐Ÿ“๐Ÿฃ๐Ÿค๐Ÿฅ๐Ÿฆ๐Ÿง๐Ÿ•Š๐Ÿฆ…๐Ÿฆ†๐Ÿฆข๐Ÿฆ‰๐Ÿฆค๐Ÿชถ๐Ÿฆฉ๐Ÿฆš๐Ÿฆœ๐Ÿธ๐ŸŠ๐Ÿข๐ŸฆŽ๐Ÿ๐Ÿฒ๐Ÿ‰๐Ÿฆ•๐Ÿฆ–๐Ÿณ๐Ÿ‹๐Ÿฌ๐Ÿฆญ๐ŸŸ๐Ÿ ๐Ÿก๐Ÿฆˆ๐Ÿ™๐Ÿš๐ŸŒ๐Ÿฆ‹๐Ÿ›๐Ÿœ๐Ÿ๐Ÿชฒ๐Ÿž๐Ÿฆ—๐Ÿชณ๐Ÿ•ท๐Ÿ•ธ๐Ÿฆ‚๐ŸฆŸ๐Ÿชฐ๐Ÿชฑ๐Ÿฆ ๐Ÿ’๐ŸŒธ๐Ÿ’ฎ๐Ÿต๐ŸŒน๐Ÿฅ€๐ŸŒบ๐ŸŒป๐ŸŒผ๐ŸŒท๐ŸŒฑ๐Ÿชด๐ŸŒฒ๐ŸŒณ๐ŸŒด๐ŸŒต๐ŸŒพ๐ŸŒฟโ˜˜๐Ÿ€๐Ÿ๐Ÿ‚๐Ÿƒ๐Ÿ‡๐Ÿˆ๐Ÿ‰๐ŸŠ๐Ÿ‹๐ŸŒ๐Ÿ๐Ÿฅญ๐ŸŽ๐Ÿ๐Ÿ๐Ÿ‘๐Ÿ’๐Ÿ“๐Ÿซ๐Ÿฅ๐Ÿ…๐Ÿซ’๐Ÿฅฅ๐Ÿฅ‘๐Ÿ†๐Ÿฅ”๐Ÿฅ•๐ŸŒฝ๐ŸŒถ๐Ÿซ‘๐Ÿฅ’๐Ÿฅฌ๐Ÿฅฆ๐Ÿง„๐Ÿง…๐Ÿ„๐Ÿฅœ๐ŸŒฐ๐Ÿž๐Ÿฅ๐Ÿฅ–๐Ÿซ“๐Ÿฅจ๐Ÿฅฏ๐Ÿฅž๐Ÿง‡๐Ÿง€๐Ÿ–๐Ÿ—๐Ÿฅฉ๐Ÿฅ“๐Ÿ”๐ŸŸ๐Ÿ•๐ŸŒญ๐Ÿฅช๐ŸŒฎ๐ŸŒฏ๐Ÿซ”๐Ÿฅ™๐Ÿง†๐Ÿฅš๐Ÿณ๐Ÿฅ˜๐Ÿฒ๐Ÿซ•๐Ÿฅฃ๐Ÿฅ—๐Ÿฟ๐Ÿงˆ๐Ÿง‚๐Ÿฅซ๐Ÿฑ๐Ÿ˜๐Ÿ™๐Ÿš๐Ÿ›๐Ÿœ๐Ÿ๐Ÿ ๐Ÿข๐Ÿฃ๐Ÿค๐Ÿฅ๐Ÿฅฎ๐Ÿก๐ŸฅŸ๐Ÿฅ ๐Ÿฅก๐Ÿฆ€๐Ÿฆž๐Ÿฆ๐Ÿฆ‘๐Ÿฆช๐Ÿฆ๐Ÿง๐Ÿจ๐Ÿฉ๐Ÿช๐ŸŽ‚๐Ÿฐ๐Ÿง๐Ÿฅง๐Ÿซ๐Ÿฌ๐Ÿญ๐Ÿฎ๐Ÿฏ๐Ÿผ๐Ÿฅ›โ˜•๐Ÿซ–๐Ÿต๐Ÿถ๐Ÿพ๐Ÿท๐Ÿธ๐Ÿน๐Ÿบ๐Ÿป๐Ÿฅ‚๐Ÿฅƒ๐Ÿฅค๐Ÿง‹๐Ÿงƒ๐Ÿง‰๐ŸงŠ๐Ÿฅข๐Ÿฝ๐Ÿด๐Ÿฅ„๐Ÿ”ช๐Ÿบ๐ŸŒ๐ŸŒŽ๐ŸŒ๐ŸŒ๐Ÿ—บ๐Ÿ—พ๐Ÿงญ๐Ÿ”โ›ฐ๐ŸŒ‹๐Ÿ—ป๐Ÿ•๐Ÿ–๐Ÿœ๐Ÿ๐Ÿž๐ŸŸ๐Ÿ›๐Ÿ—๐Ÿงฑ๐Ÿชจ๐Ÿชต๐Ÿ›–๐Ÿ˜๐Ÿš๐Ÿ ๐Ÿก๐Ÿข๐Ÿฃ๐Ÿค๐Ÿฅ๐Ÿฆ๐Ÿจ๐Ÿฉ๐Ÿช๐Ÿซ๐Ÿฌ๐Ÿญ๐Ÿฏ๐Ÿฐ๐Ÿ’’๐Ÿ—ผ๐Ÿ—ฝโ›ช๐Ÿ•Œ๐Ÿ›•๐Ÿ•โ›ฉ๐Ÿ•‹โ›ฒโ›บ๐ŸŒ๐ŸŒƒ๐Ÿ™๐ŸŒ„๐ŸŒ…๐ŸŒ†๐ŸŒ‡๐ŸŒ‰โ™จ๐ŸŽ ๐ŸŽก๐ŸŽข๐Ÿ’ˆ๐ŸŽช๐Ÿš‚๐Ÿšƒ๐Ÿš„๐Ÿš…๐Ÿš†๐Ÿš‡๐Ÿšˆ๐Ÿš‰๐ŸšŠ๐Ÿš๐Ÿšž๐Ÿš‹๐ŸšŒ๐Ÿš๐ŸšŽ๐Ÿš๐Ÿš‘๐Ÿš’๐Ÿš“๐Ÿš”๐Ÿš•๐Ÿš–๐Ÿš—๐Ÿš˜๐Ÿš™๐Ÿ›ป๐Ÿšš๐Ÿš›๐Ÿšœ๐ŸŽ๐Ÿ๐Ÿ›ต๐Ÿฆฝ๐Ÿฆผ๐Ÿ›บ๐Ÿšฒ๐Ÿ›ด๐Ÿ›น๐Ÿ›ผ๐Ÿš๐Ÿ›ฃ๐Ÿ›ค๐Ÿ›ขโ›ฝ๐Ÿšจ๐Ÿšฅ๐Ÿšฆ๐Ÿ›‘๐Ÿšงโš“โ›ต๐Ÿ›ถ๐Ÿšค๐Ÿ›ณโ›ด๐Ÿ›ฅ๐Ÿšขโœˆ๐Ÿ›ฉ๐Ÿ›ซ๐Ÿ›ฌ๐Ÿช‚๐Ÿ’บ๐Ÿš๐ŸšŸ๐Ÿš ๐Ÿšก๐Ÿ›ฐ๐Ÿš€๐Ÿ›ธ๐Ÿ›Ž๐ŸงณโŒ›โณโŒšโฐโฑโฒ๐Ÿ•ฐ๐Ÿ•›๐Ÿ•ง๐Ÿ•๐Ÿ•œ๐Ÿ•‘๐Ÿ•๐Ÿ•’๐Ÿ•ž๐Ÿ•“๐Ÿ•Ÿ๐Ÿ•”๐Ÿ• ๐Ÿ••๐Ÿ•ก๐Ÿ•–๐Ÿ•ข๐Ÿ•—๐Ÿ•ฃ๐Ÿ•˜๐Ÿ•ค๐Ÿ•™๐Ÿ•ฅ๐Ÿ•š๐Ÿ•ฆ๐ŸŒ‘๐ŸŒ’๐ŸŒ“๐ŸŒ”๐ŸŒ•๐ŸŒ–๐ŸŒ—๐ŸŒ˜๐ŸŒ™๐ŸŒš๐ŸŒ›๐ŸŒœ๐ŸŒกโ˜€๐ŸŒ๐ŸŒž๐Ÿชโญ๐ŸŒŸ๐ŸŒ 
๐ŸŒŒโ˜โ›…โ›ˆ๐ŸŒค๐ŸŒฅ๐ŸŒฆ๐ŸŒง๐ŸŒจ๐ŸŒฉ๐ŸŒช๐ŸŒซ๐ŸŒฌ๐ŸŒ€๐ŸŒˆ๐ŸŒ‚โ˜‚โ˜”โ›ฑโšกโ„โ˜ƒโ›„โ˜„๐Ÿ”ฅ๐Ÿ’ง๐ŸŒŠ๐ŸŽƒ๐ŸŽ„๐ŸŽ†๐ŸŽ‡๐Ÿงจโœจ๐ŸŽˆ๐ŸŽ‰๐ŸŽŠ๐ŸŽ‹๐ŸŽ๐ŸŽŽ๐ŸŽ๐ŸŽ๐ŸŽ‘๐Ÿงง๐ŸŽ€๐ŸŽ๐ŸŽ—๐ŸŽŸ๐ŸŽซ๐ŸŽ–๐Ÿ†๐Ÿ…๐Ÿฅ‡๐Ÿฅˆ๐Ÿฅ‰โšฝโšพ๐ŸฅŽ๐Ÿ€๐Ÿ๐Ÿˆ๐Ÿ‰๐ŸŽพ๐Ÿฅ๐ŸŽณ๐Ÿ๐Ÿ‘๐Ÿ’๐Ÿฅ๐Ÿ“๐Ÿธ๐ŸฅŠ๐Ÿฅ‹๐Ÿฅ…โ›ณโ›ธ๐ŸŽฃ๐Ÿคฟ๐ŸŽฝ๐ŸŽฟ๐Ÿ›ท๐ŸฅŒ๐ŸŽฏ๐Ÿช€๐Ÿช๐ŸŽฑ๐Ÿ”ฎ๐Ÿช„๐Ÿงฟ๐ŸŽฎ๐Ÿ•น๐ŸŽฐ๐ŸŽฒ๐Ÿงฉ๐Ÿงธ๐Ÿช…๐Ÿช†โ™ โ™ฅโ™ฆโ™ฃโ™Ÿ๐Ÿƒ๐Ÿ€„๐ŸŽด๐ŸŽญ๐Ÿ–ผ๐ŸŽจ๐Ÿงต๐Ÿชก๐Ÿงถ๐Ÿชข๐Ÿ‘“๐Ÿ•ถ๐Ÿฅฝ๐Ÿฅผ๐Ÿฆบ๐Ÿ‘”๐Ÿ‘•๐Ÿ‘–๐Ÿงฃ๐Ÿงค๐Ÿงฅ๐Ÿงฆ๐Ÿ‘—๐Ÿ‘˜๐Ÿฅป๐Ÿฉฑ๐Ÿฉฒ๐Ÿฉณ๐Ÿ‘™๐Ÿ‘š๐Ÿ‘›๐Ÿ‘œ๐Ÿ‘๐Ÿ›๐ŸŽ’๐Ÿฉด๐Ÿ‘ž๐Ÿ‘Ÿ๐Ÿฅพ๐Ÿฅฟ๐Ÿ‘ ๐Ÿ‘ก๐Ÿฉฐ๐Ÿ‘ข๐Ÿ‘‘๐Ÿ‘’๐ŸŽฉ๐ŸŽ“๐Ÿงข๐Ÿช–โ›‘๐Ÿ“ฟ๐Ÿ’„๐Ÿ’๐Ÿ’Ž๐Ÿ”‡๐Ÿ”ˆ๐Ÿ”‰๐Ÿ”Š๐Ÿ“ข๐Ÿ“ฃ๐Ÿ“ฏ๐Ÿ””๐Ÿ”•๐ŸŽผ๐ŸŽต๐ŸŽถ๐ŸŽ™๐ŸŽš๐ŸŽ›๐ŸŽค๐ŸŽง๐Ÿ“ป๐ŸŽท๐Ÿช—๐ŸŽธ๐ŸŽน๐ŸŽบ๐ŸŽป๐Ÿช•๐Ÿฅ"; - -#[bench] -fn str_char_count_lorem(b: &mut Bencher) { - b.iter(|| black_box(LOREM).chars().count()); -} - -#[bench] -fn str_char_count_lorem_short(b: &mut Bencher) { - b.iter(|| black_box(LOREM_SHORT).chars().count()); -} - -#[bench] -fn str_char_count_emoji(b: &mut Bencher) { - b.iter(|| black_box(EMOJI).chars().count()); -} +mod char_count; +mod corpora; #[bench] fn str_validate_emoji(b: &mut Bencher) { - b.iter(|| str::from_utf8(black_box(EMOJI.as_bytes()))); + b.iter(|| str::from_utf8(black_box(corpora::emoji::LARGE.as_bytes()))); } diff --git a/library/core/benches/str/char_count.rs b/library/core/benches/str/char_count.rs new file mode 100644 index 0000000000000..f19d094114254 --- /dev/null +++ b/library/core/benches/str/char_count.rs @@ -0,0 +1,101 @@ +use super::corpora::*; +use test::{black_box, Bencher}; + +macro_rules! 
define_benches { + ($( fn $name: ident($arg: ident: &str) $body: block )+) => { + define_benches!(mod en_small, en::SMALL, $($name $arg $body)+); + define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+); + define_benches!(mod en_large, en::LARGE, $($name $arg $body)+); + define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+); + + define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+); + define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+); + define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+); + define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+); + + define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+); + define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+); + define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+); + define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+); + + define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+); + define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+); + define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+); + define_benches!(mod emoji_huge, emoji::HUGE, $($name $arg $body)+); + }; + (mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => { + mod $mod_name { + use super::*; + $( + #[bench] + fn $name(bencher: &mut Bencher) { + let input = $input; + bencher.bytes = input.len() as u64; + let mut input_s = input.to_string(); + bencher.iter(|| { + let $arg: &str = &black_box(&mut input_s); + black_box($body) + }) + } + )+ + } + }; +} + +define_benches! 
{ + fn case00_cur_libcore(s: &str) { + cur_libcore(s) + } + + fn case01_old_libcore(s: &str) { + old_libcore(s) + } + + fn case02_iter_increment(s: &str) { + iterator_increment(s) + } + + fn case03_manual_char_len(s: &str) { + manual_char_len(s) + } +} + +fn cur_libcore(s: &str) -> usize { + s.chars().count() +} +#[inline] +fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} +fn old_libcore(s: &str) -> usize { + s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count() +} + +fn iterator_increment(s: &str) -> usize { + let mut c = 0; + for _ in s.chars() { + c += 1; + } + c +} + +fn manual_char_len(s: &str) -> usize { + let s = s.as_bytes(); + let mut c = 0; + let mut i = 0; + let l = s.len(); + while i < l { + let b = s[i]; + if b < 0x80 { + i += 1; + } else if b < 0xe0 { + i += 2; + } else if b < 0xf0 { + i += 3; + } else { + i += 4; + } + c += 1; + } + c +} diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs new file mode 100644 index 0000000000000..fefde75715034 --- /dev/null +++ b/library/core/benches/str/corpora.rs @@ -0,0 +1,83 @@ +//! Exposes a number of modules with different kinds of strings. +//! +//! Each module contains `&str` constants named `SMALL`, `MEDIUM`, `LARGE`, and +//! `HUGE`. +//! +//! - The `SMALL` string is generally around 30-40 bytes. +//! - The `MEDIUM` string is generally around 600-700 bytes. +//! - The `LARGE` string is the `MEDIUM` string repeated 8x, and isย around 5kb. +//! - The `HUGE` string is the `LARGE` string repeated 8x (or the `MEDIUM` +//! string repeated 64x), and is around 40kb. +//! +//! Except for `mod emoji` (which is just a bunch of emoji), the strings were +//! pulled from (localizations of) rust-lang.org. + +macro_rules! repeat8 { + ($s:expr) => { + concat!($s, $s, $s, $s, $s, $s, $s, $s) + }; +} + +macro_rules! 
define_consts { + ($s:literal) => { + pub const MEDIUM: &str = $s; + pub const LARGE: &str = repeat8!($s); + pub const HUGE: &str = repeat8!(repeat8!(repeat8!($s))); + }; +} + +pub mod en { + pub const SMALL: &str = "Mary had a little lamb, Little lamb"; + define_consts! { + "Rust is blazingly fast and memory-efficient: with no runtime or garbage + collector, it can power performance-critical services, run on embedded + devices, and easily integrate with other languages. Rustโ€™s rich type system + and ownership model guarantee memory-safety and thread-safety โ€” enabling you + to eliminate many classes of bugs at compile-time. Rust has great + documentation, a friendly compiler with useful error messages, and top-notch + tooling โ€” an integrated package manager and build tool, smart multi-editor + support with auto-completion and type inspections, an auto-formatter, and + more." + } +} + +pub mod zh { + pub const SMALL: &str = "ๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜"; + define_consts! { + "Rust ้€ŸๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜ใ€‚็”ฑไบŽ\ + ๆฒกๆœ‰่ฟ่กŒๆ—ถๅ’Œๅžƒๅœพๅ›žๆ”ถ๏ผŒๅฎƒ่ƒฝๅคŸ่ƒœไปปๅฏนๆ€ง่ƒฝ่ฆ\ + ๆฑ‚็‰นๅˆซ้ซ˜็š„ๆœๅŠก๏ผŒๅฏไปฅๅœจๅตŒๅ…ฅๅผ่ฎพๅค‡ไธŠ่ฟ่กŒ๏ผŒ\ + ่ฟ˜่ƒฝ่ฝปๆพๅ’Œๅ…ถไป–่ฏญ่จ€้›†ๆˆใ€‚Rust ไธฐๅฏŒ็š„็ฑปๅž‹\ + ็ณป็ปŸๅ’Œๆ‰€ๆœ‰ๆƒๆจกๅž‹ไฟ่ฏไบ†ๅ†…ๅญ˜ๅฎ‰ๅ…จๅ’Œ็บฟ็จ‹ๅฎ‰ๅ…จ๏ผŒ\ + ่ฎฉๆ‚จๅœจ็ผ–่ฏ‘ๆœŸๅฐฑ่ƒฝๅคŸๆถˆ้™คๅ„็งๅ„ๆ ท็š„้”™่ฏฏใ€‚\ + Rust ๆ‹ฅๆœ‰ๅ‡บ่‰ฒ็š„ๆ–‡ๆกฃใ€ๅ‹ๅฅฝ็š„็ผ–่ฏ‘ๅ™จๅ’Œๆธ…ๆ™ฐ\ + ็š„้”™่ฏฏๆ็คบไฟกๆฏ๏ผŒ ่ฟ˜้›†ๆˆไบ†ไธ€ๆต็š„ๅทฅๅ…ทโ€”โ€”\ + ๅŒ…็ฎก็†ๅ™จๅ’Œๆž„ๅปบๅทฅๅ…ท๏ผŒ ๆ™บ่ƒฝๅœฐ่‡ชๅŠจ่กฅๅ…จๅ’Œ็ฑป\ + ๅž‹ๆฃ€้ชŒ็š„ๅคš็ผ–่พ‘ๅ™จๆ”ฏๆŒ๏ผŒ ไปฅๅŠ่‡ชๅŠจๆ ผๅผๅŒ–ไปฃ\ + ็ ็ญ‰็ญ‰ใ€‚" + } +} + +pub mod ru { + pub const SMALL: &str = "ะกะพั‚ะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ"; + define_consts! { + "ะกะพั‚ะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ ะฒัะตะผัƒ ะผะธั€ัƒ ะธัะฟะพะปัŒะทัƒัŽั‚ Rust ะฒ ั€ะตะฐะปัŒะฝั‹ั…\ + ะฟั€ะพะตะบั‚ะฐั… ะดะปั ะฑั‹ัั‚ั€ั‹ั… ะบั€ะพัั-ะฟะปะฐั‚ั„ะพั€ะผะตะฝะฝั‹ั… ั€ะตัˆะตะฝะธะน ั\ + ะพะณั€ะฐะฝะธั‡ะตะฝะฝั‹ะผะธ ั€ะตััƒั€ัะฐะผะธ. 
ะขะฐะบะธะต ะฟั€ะพะตะบั‚ั‹, ะบะฐะบ Firefox,\ + Dropbox ะธ Cloudflare, ะธัะฟะพะปัŒะทัƒัŽั‚ Rust. Rust ะพั‚ะปะธั‡ะฝะพ\ + ะฟะพะดั…ะพะดะธั‚ ะบะฐะบ ะดะปั ัั‚ะฐั€ั‚ะฐะฟะพะฒ, ั‚ะฐะบ ะธ ะดะปั ะฑะพะปัŒัˆะธั… ะบะพะผะฟะฐะฝะธะน,\ + ะบะฐะบ ะดะปั ะฒัั‚ั€ะฐะธะฒะฐะตะผั‹ั… ัƒัั‚ั€ะพะนัั‚ะฒ, ั‚ะฐะบ ะธ ะดะปั ะผะฐััˆั‚ะฐะฑะธั€ัƒะตะผั‹ั…\ + web-ัะตั€ะฒะธัะพะฒ. ะœะพะน ัะฐะผั‹ะน ะฑะพะปัŒัˆะพะน ะบะพะผะฟะปะธะผะตะฝั‚ Rust." + } +} + +pub mod emoji { + pub const SMALL: &str = "๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿ˜†๐Ÿ˜…๐Ÿคฃ๐Ÿ˜‚๐Ÿ™‚๐Ÿ™ƒ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‡๐Ÿฅฐ๐Ÿ˜๐Ÿคฉ๐Ÿ˜˜"; + define_consts! { + "๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿ˜†๐Ÿ˜…๐Ÿคฃ๐Ÿ˜‚๐Ÿ™‚๐Ÿ™ƒ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‡๐Ÿฅฐ๐Ÿ˜๐Ÿคฉ๐Ÿ˜˜๐Ÿ˜—โ˜บ๐Ÿ˜š๐Ÿ˜™๐Ÿฅฒ๐Ÿ˜‹๐Ÿ˜›๐Ÿ˜œ๐Ÿคช๐Ÿ˜๐Ÿค‘๐Ÿค—๐Ÿคญ๐Ÿคซ๐Ÿค”๐Ÿค๐Ÿคจ๐Ÿ˜๐Ÿ˜‘๐Ÿ˜ถ๐Ÿ˜ถโ€๐ŸŒซ๏ธ๐Ÿ˜๐Ÿ˜’\ + ๐Ÿ™„๐Ÿ˜ฌ๐Ÿ˜ฎโ€๐Ÿ’จ๐Ÿคฅ๐Ÿ˜Œ๐Ÿ˜”๐Ÿ˜ช๐Ÿคค๐Ÿ˜ด๐Ÿ˜ท๐Ÿค’๐Ÿค•๐Ÿคข๐Ÿคฎ๐Ÿคง๐Ÿฅต๐Ÿฅถ๐Ÿฅด๐Ÿ˜ต๐Ÿ˜ตโ€๐Ÿ’ซ๐Ÿคฏ๏ฟฝ๏ฟฝ๐Ÿฅณ๐Ÿฅธ๐Ÿ˜Ž๐Ÿค“๐Ÿง๐Ÿ˜•๐Ÿ˜Ÿ๐Ÿ™โ˜น๐Ÿ˜ฎ๐Ÿ˜ฏ๐Ÿ˜ฒ๐Ÿ˜ณ๐Ÿฅบ๐Ÿ˜ฆ๐Ÿ˜ง๐Ÿ˜จ\ + ๐Ÿ˜ฐ๐Ÿ˜ฅ๐Ÿ˜ข๐Ÿ˜ญ๐Ÿ˜ฑ๐Ÿ˜–๐Ÿ˜ฃ๐Ÿ˜ž๐Ÿ˜“๐Ÿ˜ฉ๐Ÿ˜ซ๐Ÿฅฑ๐Ÿ˜ค๐Ÿ˜ก๐Ÿ˜ ๐Ÿคฌ๐Ÿ˜ˆ๐Ÿ‘ฟ๐Ÿ’€โ˜ ๐Ÿ’ฉ๐Ÿคก๐Ÿ‘น๐Ÿ‘บ๐Ÿ‘ป๐Ÿ‘ฝ๐Ÿ‘พ๐Ÿค–๐Ÿ˜บ๐Ÿ˜ธ๐Ÿ˜น๐Ÿ˜ป๐Ÿ˜ผ๐Ÿ˜ฝ๐Ÿ™€๐Ÿ˜ฟ๐Ÿ˜พ๐Ÿ™ˆ๐Ÿ™‰๐Ÿ™Š\ + ๐Ÿ’‹๐Ÿ’Œ๐Ÿ’˜๐Ÿ’๐Ÿ’–๐Ÿ’—๐Ÿ’“๏ฟฝ๏ฟฝ๐Ÿ’•๐Ÿ’Ÿโฃ๐Ÿ’”โค๏ธโ€๐Ÿ”ฅโค๏ธโ€๐Ÿฉนโค๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ๐ŸคŽ๐Ÿ–ค๐Ÿค๐Ÿ’ฏ๐Ÿ’ข๐Ÿ’ฅ๐Ÿ’ซ๐Ÿ’ฆ๐Ÿ’จ๐Ÿ•ณ๐Ÿ’ฌ๐Ÿ‘๏ธโ€๐Ÿ—จ๏ธ๐Ÿ—จ๐Ÿ—ฏ๐Ÿ’ญ๐Ÿ’ค๐Ÿ‘‹\ + ๐Ÿคš๐Ÿ–โœ‹๐Ÿ––๐Ÿ‘Œ๐ŸคŒ๐ŸคโœŒ" + } +} diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs new file mode 100644 index 0000000000000..464c6889c323a --- /dev/null +++ b/library/core/src/str/count.rs @@ -0,0 +1,116 @@ +//! Code for efficiently counting the number of `char`s in a UTF-8 encoded +//! string. +//! +//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`, +//! followed by some number (possibly 0) of continuation bytes. +//! +//! The leading byte can have a number of bit-patterns (with the specific +//! pattern indicating how many continuation bytes follow), but the continuation +//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any +//! value). 
That is, the most significant bit is set, and the second most +//! significant bit is unset. +//! +//! To count the number of characters, we can just count the number of bytes in +//! the string which are not continuation bytes, which can be done many bytes at +//! a time fairly easily. +//! +//! Note: Because the term "leading byte" can sometimes be ambiguous (for +//! example, it could also refer to the first byte of a slice), we'll often use +//! the term "non-continuation byte" to refer to these bytes in the code. + +pub(super) fn count_chars(s: &str) -> usize { + // For correctness, `CHUNK_SIZE` must be: + // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. + // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the + // `body.chunks(CHUNK_SIZE)` loop. + // + // For performance, `CHUNK_SIZE` should be: + // - Relatively cheap to `%` against. + // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` + // too often. + const CHUNK_SIZE: usize = 192; + const UNROLL_INNER: usize = 4; + + // Check the properties of `CHUNK_SIZE` / `UNROLL_INNER` that are required + // for correctness. + const _: [(); 1] = [(); (CHUNK_SIZE < 256 && (CHUNK_SIZE % UNROLL_INNER) == 0) as usize]; + // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size + // differences which are handled by `align_to`. + let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() }; + + let mut total = char_count_general_case(head) + char_count_general_case(tail); + // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which + // we call `sum_bytes_in_usize`. + for chunk in body.chunks(CHUNK_SIZE) { + // We accumulate intermediate sums in `counts`, where each byte contains + // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`. 
+ let mut counts = 0; + let unrolled_chunks = chunk.array_chunks::<UNROLL_INNER>(); + // If there's a remainder (know can only happen for the last item in + // `chunks`, because `CHUNK_SIZE % UNROLL == 0`), then we need to + // account for that (although we don't use it to later). + let remainder = unrolled_chunks.remainder(); + for unrolled in unrolled_chunks { + for &word in unrolled { + // Because `CHUNK_SIZE` is < 256, this addition can't cause the + // count in any of the bytes to overflow into a subsequent byte. + counts += contains_non_continuation_byte(word); + } + } + + // Sum the values in `counts` (which, again, is conceptually a `[u8; + // size_of::<usize>()]`), and accumulate the result into `total`. + total += sum_bytes_in_usize(counts); + + // If there's any data in `remainder`, then handle it. This will only + // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE` + // is divisible by `UNROLL_INNER`), so we explicitly break at the end + // (which seems to help LLVM out). + if !remainder.is_empty() { + // Accumulate all the data in the remainder. + let mut counts = 0; + for &word in remainder { + counts += contains_non_continuation_byte(word); + } + total += sum_bytes_in_usize(counts); + break; + } + } + total +} + +// Checks each byte of `w` to see if it contains the first byte in a UTF-8 +// sequence. Bytes in `w` which are continuation bytes are left as `0x00` (e.g. +// false), and bytes which are non-continuation bytes are left as `0x01` (e.g. +// true) +#[inline] +fn contains_non_continuation_byte(w: usize) -> usize { + let lsb = 0x0101_0101_0101_0101u64 as usize; + ((!w >> 7) | (w >> 6)) & lsb +} + +// Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but +// more efficient. 
+#[inline] +fn sum_bytes_in_usize(values: usize) -> usize { + const LSB_SHORTS: usize = 0x0001_0001_0001_0001_u64 as usize; + const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize; + + let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); + pair_sum.wrapping_mul(LSB_SHORTS) >> ((core::mem::size_of::<usize>() - 2) * 8) +} + +// This is the most direct implementation of the concept of "count the number of +// bytes in the string which are not continuation bytes", and is used for the +// head and tail of the input string (the first and last item in the tuple +// returned by `slice::align_to`). +fn char_count_general_case(s: &[u8]) -> usize { + const CONT_MASK_U8: u8 = 0b0011_1111; + const TAG_CONT_U8: u8 = 0b1000_0000; + let mut leads = 0; + for &byte in s { + let is_lead = (byte & !CONT_MASK_U8) != TAG_CONT_U8; + leads += is_lead as usize; + } + leads +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index de6e6d52b3625..e529bccbc7999 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -12,7 +12,7 @@ use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; use super::pattern::Pattern; use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; -use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; +use super::validations::{next_code_point, next_code_point_reverse}; use super::LinesAnyMap; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; @@ -46,8 +46,7 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn count(self) -> usize { - // length in `char` is equal to the number of non-continuation bytes - self.iter.filter(|&&byte| !utf8_is_cont_byte(byte)).count() + super::count::count_chars(self.as_str()) } #[inline] diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 1d4600fa4a2d7..fceea2366da54 100644 --- a/library/core/src/str/mod.rs +++ 
b/library/core/src/str/mod.rs @@ -7,6 +7,7 @@ #![stable(feature = "rust1", since = "1.0.0")] mod converts; +mod count; mod error; mod iter; mod traits; From ed01324835fd4d2aef53ba0b767562640e2f9c71 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Sat, 30 Oct 2021 07:52:19 -0700 Subject: [PATCH 2/5] Fix zh::SMALL string in core::str benchmarks --- library/core/benches/str/corpora.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs index fefde75715034..04e60f0144a02 100644 --- a/library/core/benches/str/corpora.rs +++ b/library/core/benches/str/corpora.rs @@ -42,7 +42,7 @@ pub mod en { } pub mod zh { - pub const SMALL: &str = "ๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜"; + pub const SMALL: &str = "้€ŸๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜"; define_consts! { "Rust ้€ŸๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜ใ€‚็”ฑไบŽ\ ๆฒกๆœ‰่ฟ่กŒๆ—ถๅ’Œๅžƒๅœพๅ›žๆ”ถ๏ผŒๅฎƒ่ƒฝๅคŸ่ƒœไปปๅฏนๆ€ง่ƒฝ่ฆ\ From 002aaf2c65765408c8fddb7e3384d224f35fd1f1 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Sat, 30 Oct 2021 14:04:49 -0700 Subject: [PATCH 3/5] Ensure non-power-of-two sizes are tested in the Chars::count test --- library/alloc/tests/str.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 3dcbc54be4e8d..7b07821ab1d31 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -2234,9 +2234,11 @@ fn utf8_chars() { #[test] fn utf8_char_counts() { let strs = [("e", 1), ("รฉ", 1), ("โ‚ฌ", 1), ("\u{10000}", 1), ("eรฉโ‚ฌ\u{10000}", 4)]; - let mut reps = vec![1, 8, 64, 256, 512, 1024]; + let mut reps = + [8, 64, 256, 512, 1024].iter().copied().flat_map(|n| n - 8..=n + 8).collect::>(); if cfg!(not(miri)) { - reps.push(1 << 16); + let big = 1 << 16; + reps.extend(big - 8..=big + 8); } let counts = if cfg!(miri) { 0..1 } else { 0..8 }; let padding = counts.map(|len| " ".repeat(len)).collect::>(); From 
ebbccaf6bf5da7ff83661969f762c86219a96779 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Mon, 31 Jan 2022 19:07:08 -0800 Subject: [PATCH 4/5] Respond to review feedback, and improve implementation somewhat --- library/core/benches/str/char_count.rs | 18 +++++--- library/core/benches/str/corpora.rs | 9 +++- library/core/src/str/count.rs | 60 +++++++++++++++++--------- 3 files changed, 59 insertions(+), 28 deletions(-) diff --git a/library/core/benches/str/char_count.rs b/library/core/benches/str/char_count.rs index f19d094114254..25d9b2e299223 100644 --- a/library/core/benches/str/char_count.rs +++ b/library/core/benches/str/char_count.rs @@ -3,21 +3,25 @@ use test::{black_box, Bencher}; macro_rules! define_benches { ($( fn $name: ident($arg: ident: &str) $body: block )+) => { + define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+); define_benches!(mod en_small, en::SMALL, $($name $arg $body)+); define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+); define_benches!(mod en_large, en::LARGE, $($name $arg $body)+); define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+); + define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+); define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+); define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+); define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+); define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+); + define_benches!(mod ru_tiny, ru::TINY, $($name $arg $body)+); define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+); define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+); define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+); define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+); + define_benches!(mod emoji_tiny, emoji::TINY, $($name $arg $body)+); define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+); define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+); define_benches!(mod emoji_large, 
emoji::LARGE, $($name $arg $body)+); @@ -43,12 +47,12 @@ macro_rules! define_benches { } define_benches! { - fn case00_cur_libcore(s: &str) { - cur_libcore(s) + fn case00_libcore(s: &str) { + libcore(s) } - fn case01_old_libcore(s: &str) { - old_libcore(s) + fn case01_filter_count_cont_bytes(s: &str) { + filter_count_cont_bytes(s) } fn case02_iter_increment(s: &str) { @@ -60,14 +64,16 @@ define_benches! { } } -fn cur_libcore(s: &str) -> usize { +fn libcore(s: &str) -> usize { s.chars().count() } + #[inline] fn utf8_is_cont_byte(byte: u8) -> bool { (byte as i8) < -64 } -fn old_libcore(s: &str) -> usize { + +fn filter_count_cont_bytes(s: &str) -> usize { s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count() } diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs index 04e60f0144a02..b4ac625061dfa 100644 --- a/library/core/benches/str/corpora.rs +++ b/library/core/benches/str/corpora.rs @@ -1,8 +1,9 @@ //! Exposes a number of modules with different kinds of strings. //! -//! Each module contains `&str` constants named `SMALL`, `MEDIUM`, `LARGE`, and -//! `HUGE`. +//! Each module contains `&str` constants named `TINY`, `SMALL`, `MEDIUM`, +//! `LARGE`, and `HUGE`. //! +//! - The `TINY` string is generally around 8 bytes. //! - The `SMALL` string is generally around 30-40 bytes. //! - The `MEDIUM` string is generally around 600-700 bytes. //! - The `LARGE` string is the `MEDIUM` string repeated 8x, and isย around 5kb. @@ -27,6 +28,7 @@ macro_rules! define_consts { } pub mod en { + pub const TINY: &str = "Mary had"; pub const SMALL: &str = "Mary had a little lamb, Little lamb"; define_consts! { "Rust is blazingly fast and memory-efficient: with no runtime or garbage @@ -42,6 +44,7 @@ pub mod en { } pub mod zh { + pub const TINY: &str = "้€ŸๅบฆๆƒŠ"; pub const SMALL: &str = "้€ŸๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜"; define_consts! 
{ "Rust ้€ŸๅบฆๆƒŠไบบไธ”ๅ†…ๅญ˜ๅˆฉ็”จ็Ž‡ๆž้ซ˜ใ€‚็”ฑไบŽ\ @@ -59,6 +62,7 @@ pub mod zh { } pub mod ru { + pub const TINY: &str = "ะกะพั‚ะฝะธ"; pub const SMALL: &str = "ะกะพั‚ะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ"; define_consts! { "ะกะพั‚ะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ ะฒัะตะผัƒ ะผะธั€ัƒ ะธัะฟะพะปัŒะทัƒัŽั‚ Rust ะฒ ั€ะตะฐะปัŒะฝั‹ั…\ @@ -72,6 +76,7 @@ pub mod ru { } pub mod emoji { + pub const TINY: &str = "๐Ÿ˜€๐Ÿ˜ƒ"; pub const SMALL: &str = "๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿ˜†๐Ÿ˜…๐Ÿคฃ๐Ÿ˜‚๐Ÿ™‚๐Ÿ™ƒ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‡๐Ÿฅฐ๐Ÿ˜๐Ÿคฉ๐Ÿ˜˜"; define_consts! { "๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿ˜†๐Ÿ˜…๐Ÿคฃ๐Ÿ˜‚๐Ÿ™‚๐Ÿ™ƒ๐Ÿ˜‰๐Ÿ˜Š๐Ÿ˜‡๐Ÿฅฐ๐Ÿ˜๐Ÿคฉ๐Ÿ˜˜๐Ÿ˜—โ˜บ๐Ÿ˜š๐Ÿ˜™๐Ÿฅฒ๐Ÿ˜‹๐Ÿ˜›๐Ÿ˜œ๐Ÿคช๐Ÿ˜๐Ÿค‘๐Ÿค—๐Ÿคญ๐Ÿคซ๐Ÿค”๐Ÿค๐Ÿคจ๐Ÿ˜๐Ÿ˜‘๐Ÿ˜ถ๐Ÿ˜ถโ€๐ŸŒซ๏ธ๐Ÿ˜๐Ÿ˜’\ diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs index 464c6889c323a..a80ebac734d78 100644 --- a/library/core/src/str/count.rs +++ b/library/core/src/str/count.rs @@ -17,27 +17,57 @@ //! Note: Because the term "leading byte" can sometimes be ambiguous (for //! example, it could also refer to the first byte of a slice), we'll often use //! the term "non-continuation byte" to refer to these bytes in the code. +use core::intrinsics::unlikely; +const USIZE_SIZE: usize = core::mem::size_of::(); +const UNROLL_INNER: usize = 4; + +#[inline] pub(super) fn count_chars(s: &str) -> usize { + if s.len() < USIZE_SIZE * UNROLL_INNER { + // Avoid entering the optimized implementation for strings where the + // difference is not likely to matter, or where it might even be slower. + // That said, a ton of thought was not spent on the particular threshold + // here, beyond "this value seems to make sense". + char_count_general_case(s.as_bytes()) + } else { + do_count_chars(s) + } +} + +fn do_count_chars(s: &str) -> usize { // For correctness, `CHUNK_SIZE` must be: + // // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the // `body.chunks(CHUNK_SIZE)` loop. 
// // For performance, `CHUNK_SIZE` should be: - // - Relatively cheap to `%` against. + // - Relatively cheap to `/` against (so some simple sum of powers of two). // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` // too often. const CHUNK_SIZE: usize = 192; - const UNROLL_INNER: usize = 4; - // Check the properties of `CHUNK_SIZE` / `UNROLL_INNER` that are required + // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required // for correctness. - const _: [(); 1] = [(); (CHUNK_SIZE < 256 && (CHUNK_SIZE % UNROLL_INNER) == 0) as usize]; + const _: () = assert!(CHUNK_SIZE < 256); + const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0); + // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size // differences which are handled by `align_to`. let (head, body, tail) = unsafe { s.as_bytes().align_to::() }; + // This should be quite rare, and basically exists to handle the degenerate + // cases where align_to fails (as well as miri under symbolic alignment + // mode). + // + // The `unlikely` helps discourage LLVM from inlining the body, which is + // nice, as we would rather not mark the `char_count_general_case` function + // as cold. + if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) { + return char_count_general_case(s.as_bytes()); + } + let mut total = char_count_general_case(head) + char_count_general_case(tail); // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which // we call `sum_bytes_in_usize`. @@ -45,11 +75,8 @@ pub(super) fn count_chars(s: &str) -> usize { // We accumulate intermediate sums in `counts`, where each byte contains // a subset of the sum of this chunk, like a `[u8; size_of::()]`. 
let mut counts = 0; - let unrolled_chunks = chunk.array_chunks::(); - // If there's a remainder (know can only happen for the last item in - // `chunks`, because `CHUNK_SIZE % UNROLL == 0`), then we need to - // account for that (although we don't use it to later). - let remainder = unrolled_chunks.remainder(); + + let (unrolled_chunks, remainder) = chunk.as_chunks::(); for unrolled in unrolled_chunks { for &word in unrolled { // Because `CHUNK_SIZE` is < 256, this addition can't cause the @@ -85,8 +112,8 @@ pub(super) fn count_chars(s: &str) -> usize { // true) #[inline] fn contains_non_continuation_byte(w: usize) -> usize { - let lsb = 0x0101_0101_0101_0101u64 as usize; - ((!w >> 7) | (w >> 6)) & lsb + const LSB: usize = 0x0101_0101_0101_0101u64 as usize; + ((!w >> 7) | (w >> 6)) & LSB } // Morally equivalent to `values.to_ne_bytes().into_iter().sum::()`, but @@ -97,7 +124,7 @@ fn sum_bytes_in_usize(values: usize) -> usize { const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize; let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); - pair_sum.wrapping_mul(LSB_SHORTS) >> ((core::mem::size_of::() - 2) * 8) + pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8) } // This is the most direct implementation of the concept of "count the number of @@ -105,12 +132,5 @@ fn sum_bytes_in_usize(values: usize) -> usize { // head and tail of the input string (the first and last item in the tuple // returned by `slice::align_to`). 
fn char_count_general_case(s: &[u8]) -> usize { - const CONT_MASK_U8: u8 = 0b0011_1111; - const TAG_CONT_U8: u8 = 0b1000_0000; - let mut leads = 0; - for &byte in s { - let is_lead = (byte & !CONT_MASK_U8) != TAG_CONT_U8; - leads += is_lead as usize; - } - leads + s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count() } From 41f821461f0235912501a58feb0d86c58167a015 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Sat, 5 Feb 2022 11:17:10 -0800 Subject: [PATCH 5/5] Fix comment grammar for `do_count_chars` --- library/core/src/str/count.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs index a80ebac734d78..5abc2b34c075a 100644 --- a/library/core/src/str/count.rs +++ b/library/core/src/str/count.rs @@ -40,7 +40,7 @@ fn do_count_chars(s: &str) -> usize { // // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the - // `body.chunks(CHUNK_SIZE)` loop. + // `body.chunks(CHUNK_SIZE)` loop is incorrect. // // For performance, `CHUNK_SIZE` should be: // - Relatively cheap to `/` against (so some simple sum of powers of two).