Permalink
Cannot retrieve contributors at this time
Join GitHub today
GitHub is home to over 28 million developers working together to host and review code, manage projects, and build software together.
Sign up
Fetching contributors…
| // Copyright 2014 The Rust Project Developers. See the COPYRIGHT | |
| // file at the top-level directory of this distribution and at | |
| // http://rust-lang.org/COPYRIGHT. | |
| // | |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
| // option. This file may not be copied, modified, or distributed | |
| // except according to those terms. | |
| use test::Bencher; | |
| use {Regex, Text}; | |
| // USAGE: sherlock!(name, pattern, count) | |
| // | |
| // This is same as bench_find, except it always uses the Sherlock haystack. | |
| macro_rules! sherlock { | |
| ($name:ident, $pattern:expr, $count:expr) => { | |
| bench_find!( | |
| $name, $pattern, $count, | |
| include_str!("data/sherlock.txt").to_owned() | |
| ); | |
| } | |
| } | |
| // These patterns are all single string literals that compile down to a variant | |
| // of Boyer-Moore w/ memchr. This also demonstrates the impact that the | |
| // frequency of a match has on performance. | |
| sherlock!(name_sherlock, r"Sherlock", 97); | |
| sherlock!(name_holmes, r"Holmes", 461); | |
| sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91); | |
| // Like the above, except case insensitively. The prefix detector will extract | |
| // multiple *cut* prefix literals for each of the following before hitting its | |
| // limit. All of these should be able to use either memchr2 or memchr3. | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102); | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(name_holmes_nocase, r"(?i)Holmes", 467); | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96); | |
| // Will quickly find instances of 'Sherlock', but then needs to fall back to | |
| // the lazy DFA to process the Unicode aware `\s`. | |
| sherlock!(name_whitespace, r"Sherlock\s+Holmes", 97); | |
| // Now try more variations on name matching. | |
| // This one has two alternates that both start with 'S'. This should compile | |
| // to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA. | |
| sherlock!(name_alt1, r"Sherlock|Street", 158); | |
| // This one doesn't have a common byte, but should still use Aho-Corasick and | |
| // memchr2. | |
| // Never enters lazy DFA. | |
| sherlock!(name_alt2, r"Sherlock|Holmes", 558); | |
| // Still using Aho-Corasick, but more patterns. Never enters lazy DFA but | |
| // also can't use any memchr variant. | |
| sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740); | |
| // Still using Aho-Corasick, but needs the lazy DFA. | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!( | |
| name_alt3_nocase, | |
| r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker", | |
| 753); | |
| // Should still use Aho-Corasick for the prefixes in each alternate, but | |
| // we need to use the lazy DFA to complete it. | |
| sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582); | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697); | |
| // Uses Aho-Corasick, but can use memchr3 (unlike name_alt3). | |
| sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639); | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650); | |
| // How long does it take to discover that there's no match? In the first two | |
| // cases, we detect the rarest byte in the literal to run memchr on. In the | |
| // first, it's 'z' and in the second it's 'j'. The third case only has common | |
| // letters, and is therefore slower. | |
| sherlock!(no_match_uncommon, r"zqj", 0); | |
| sherlock!(no_match_common, r"aqj", 0); | |
| sherlock!(no_match_really_common, r"aei", 0); | |
| // Various twiddling on very common words. This tends to stress the constant | |
| // overhead of actually reporting a match. (None of these actually enter any | |
| // matching engines.) | |
| sherlock!(the_lower, r"the", 7218); | |
| sherlock!(the_upper, r"The", 741); | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| sherlock!(the_nocase, r"(?i)the", 7987); | |
| // Process whitespace after a very common word. | |
| // Uses Boyer-Moore to find `the` and the lazy DFA for the rest. | |
| sherlock!(the_whitespace, r"the\s+\w+", 5410); | |
| // How fast can we match everything? This essentially defeats any clever prefix | |
| // tricks and just executes the DFA across the entire input. | |
| #[cfg(not(feature = "re-dphobos"))] | |
| #[cfg(not(feature = "re-pcre1"))] | |
| #[cfg(not(feature = "re-pcre2"))] | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(everything_greedy, r".*", 13053); | |
| // std::regex . does not match \r | |
| #[cfg(any( | |
| feature = "re-stdcpp", | |
| feature = "re-boost", | |
| ))] | |
| sherlock!(everything_greedy, r"[^\n]*", 13053); | |
| #[cfg(not(feature = "re-dphobos"))] | |
| #[cfg(not(feature = "re-onig"))] | |
| #[cfg(not(feature = "re-pcre1"))] | |
| #[cfg(not(feature = "re-pcre2"))] | |
| // std C++ does not support inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(everything_greedy_nl, r"(?s).*", 1); | |
| // How fast can we match every letter? This also defeats any clever prefix | |
| // tricks. | |
| // std C++ does not support unicode character classes | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(letters, r"\p{L}", 447160); | |
| // std C++ does not support unicode character classes | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(letters_upper, r"\p{Lu}", 14180); | |
| // std C++ does not support unicode character classes | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(letters_lower, r"\p{Ll}", 432980); | |
| // Similarly, for words. | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-re2"))] | |
| sherlock!(words, r"\w+", 109214); | |
| #[cfg(any( | |
| feature = "re-stdcpp", | |
| feature = "re-boost", | |
| feature = "re-re2", | |
| ))] | |
| sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? | |
| // Find complete words before Holmes. The `\w` defeats any prefix | |
| // optimizations. | |
| sherlock!(before_holmes, r"\w+\s+Holmes", 319); | |
| // Find complete words before Holmes. Both of the `\w`s defeat any prefix | |
| // and suffix optimizations. | |
| sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137); | |
| // Find Holmes co-occuring with Watson in a particular window of characters. | |
| // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for | |
| // the rest. | |
| sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7); | |
| // Find Holmes co-occuring with Watson in a particular window of words. | |
| // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for | |
| // the rest. | |
| #[cfg(not(feature = "re-onig"))] | |
| #[cfg(not(feature = "re-pcre1"))] | |
| #[cfg(not(feature = "re-pcre2"))] | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-tcl"))] | |
| #[cfg(not(feature = "re-dphobos-dmd-ct"))] | |
| #[cfg(not(feature = "re-dphobos-ldc-ct"))] | |
| sherlock!( | |
| holmes_coword_watson, | |
| r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes", | |
| 51); | |
| // Find some subset of quotes in the text. | |
| // This does detect the `"` or `'` prefix literal and does a quick scan for | |
| // either byte before starting the lazy DFA. | |
| sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767); | |
| // Finds all occurrences of Sherlock Holmes at the beginning or end of a line. | |
| // The empty assertions defeat any detection of prefix literals, so it's the | |
| // lazy DFA the entire way. | |
| // std C++ does not support multiline until C++17 nor the inline modifier syntax | |
| #[cfg(not(feature = "re-stdcpp"))] | |
| #[cfg(not(feature = "re-boost"))] | |
| #[cfg(not(feature = "re-dphobos"))] | |
| sherlock!( | |
| line_boundary_sherlock_holmes, | |
| r"(?m)^Sherlock Holmes|Sherlock Holmes$", | |
| 34); | |
| // D matches both \r\n and \n as EOL | |
| #[cfg(any( | |
| feature = "re-boost", | |
| feature = "re-dphobos", | |
| ))] | |
| sherlock!( | |
| line_boundary_sherlock_holmes, | |
| r"(?m)^Sherlock Holmes|Sherlock Holmes$", | |
| 37); | |
| // All words ending in `n`. This uses Unicode word boundaries, which the DFA | |
| // can speculatively handle. Since this benchmark is on mostly ASCII text, it | |
| // performs well here. A different benchmark with non-Western text would be | |
| // more revealing since the search would be forced to fall back to an NFA | |
| // simulation. | |
| #[cfg(not(feature = "re-tcl"))] | |
| sherlock!(word_ending_n, r"\b\w+n\b", 8366); | |
| // This is a real bad one for Rust's engine. This particular expression | |
| // fills the state cache quite frequently, which results in a lot of churn. | |
| // This can be made to go roughly the speed of PCRE by increasing the DFA cache | |
| // size. | |
| // | |
| // Its only salvation is that the DFA realizes it's executing slowly, gives up | |
| // quickly and falls back to the NFA algorithm. | |
| // | |
| // RE2 seems to do a worse job at this than Rust. So much so that it's slow | |
| // enough to be annoying, so we disable it. | |
| #[cfg(not(feature = "re-re2"))] | |
| sherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142); | |
| // This defeats any prefix optimizations but triggers the reverse suffix | |
| // optimization. | |
| sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824); | |
| // Similar to ing_suffix, but a little more complex by limiting the length | |
| // of the word and making sure it's surrounded by whitespace. The trailing | |
| // `\s` defeats the reverse suffix optimization. | |
| // | |
| // Onig does surprisingly well on this benchmark and yet does quite poorly on | |
| // the ing_suffix benchmark. That one has me stumped. | |
| sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081); |