/
sherlock.rs
230 lines (206 loc) · 9.07 KB
/
sherlock.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
use test::Bencher;
use crate::{Regex, Text};
// USAGE: sherlock!(name, pattern, count)
//
// This is same as bench_find, except it always uses the Sherlock haystack.
macro_rules! sherlock {
($name:ident, $pattern:expr, $count:expr) => {
bench_find!(
$name,
$pattern,
$count,
include_str!("data/sherlock.txt").to_owned()
);
};
}
// These patterns are all single string literals that compile down to a variant
// of Boyer-Moore w/ memchr. This also demonstrates the impact that the
// frequency of a match has on performance.
sherlock!(name_sherlock, r"Sherlock", 97);
sherlock!(name_holmes, r"Holmes", 461);
sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);
// Like the above, except case insensitively. The prefix detector will extract
// multiple *cut* prefix literals for each of the following before hitting its
// limit. All of these should be able to use either memchr2 or memchr3.
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);
// Will quickly find instances of 'Sherlock', but then needs to fall back to
// the lazy DFA to process the Unicode aware `\s`.
sherlock!(name_whitespace, r"Sherlock\s+Holmes", 97);
// Now try more variations on name matching.
// This one has two alternates that both start with 'S'. This should compile
// to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA.
sherlock!(name_alt1, r"Sherlock|Street", 158);
// This one doesn't have a common byte, but should still use Aho-Corasick and
// memchr2.
// Never enters lazy DFA.
sherlock!(name_alt2, r"Sherlock|Holmes", 558);
// Still using Aho-Corasick, but more patterns. Never enters lazy DFA but
// also can't use any memchr variant.
sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
// Still using Aho-Corasick, but needs the lazy DFA.
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(
name_alt3_nocase,
r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
753
);
// Should still use Aho-Corasick for the prefixes in each alternate, but
// we need to use the lazy DFA to complete it.
sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
// Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);
// How long does it take to discover that there's no match? In the first two
// cases, we detect the rarest byte in the literal to run memchr on. In the
// first, it's 'z' and in the second it's 'j'. The third case only has common
// letters, and is therefore slower.
sherlock!(no_match_uncommon, r"zqj", 0);
sherlock!(no_match_common, r"aqj", 0);
sherlock!(no_match_really_common, r"aei", 0);
// Various twiddling on very common words. This tends to stress the constant
// overhead of actually reporting a match. (None of these actually enter any
// matching engines.)
sherlock!(the_lower, r"the", 7218);
sherlock!(the_upper, r"The", 741);
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
sherlock!(the_nocase, r"(?i)the", 7987);
// Process whitespace after a very common word.
// Uses Boyer-Moore to find `the` and the lazy DFA for the rest.
sherlock!(the_whitespace, r"the\s+\w+", 5410);
// How fast can we match everything? This essentially defeats any clever prefix
// tricks and just executes the DFA across the entire input.
#[cfg(not(feature = "re-dphobos"))]
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(everything_greedy, r".*", 13053);
// std::regex . does not match \r
#[cfg(any(feature = "re-stdcpp", feature = "re-boost",))]
sherlock!(everything_greedy, r"[^\n]*", 13053);
#[cfg(not(feature = "re-dphobos"))]
#[cfg(not(feature = "re-onig"))]
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
// std C++ does not support inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(everything_greedy_nl, r"(?s).*", 1);
// How fast can we match every letter? This also defeats any clever prefix
// tricks.
// std C++ does not support unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters, r"\p{L}", 447160);
// std C++ does not support unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_upper, r"\p{Lu}", 14180);
// std C++ does not support unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_lower, r"\p{Ll}", 432980);
// Similarly, for words.
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-re2"))]
sherlock!(words, r"\w+", 109214);
#[cfg(any(feature = "re-stdcpp", feature = "re-boost", feature = "re-re2",))]
sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
// Find complete words before Holmes. The `\w` defeats any prefix
// optimizations.
sherlock!(before_holmes, r"\w+\s+Holmes", 319);
// Find complete words before Holmes. Both of the `\w`s defeat any prefix
// and suffix optimizations.
sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137);
// Find Holmes co-occurring with Watson in a particular window of characters.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
// Find Holmes co-occurring with Watson in a particular window of words.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
#[cfg(not(feature = "re-onig"))]
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
#[cfg(not(feature = "re-dphobos-dmd-ct"))]
#[cfg(not(feature = "re-dphobos-ldc-ct"))]
sherlock!(
holmes_coword_watson,
r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes",
51
);
// Find some subset of quotes in the text.
// This does detect the `"` or `'` prefix literal and does a quick scan for
// either byte before starting the lazy DFA.
sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
// Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
// The empty assertions defeat any detection of prefix literals, so it's the
// lazy DFA the entire way.
// std C++ does not support multiline until C++17 nor the inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-dphobos"))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
34
);
// D matches both \r\n and \n as EOL
#[cfg(any(feature = "re-boost", feature = "re-dphobos",))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
37
);
// All words ending in `n`. This uses Unicode word boundaries, which the DFA
// can speculatively handle. Since this benchmark is on mostly ASCII text, it
// performs well here. A different benchmark with non-Western text would be
// more revealing since the search would be forced to fall back to an NFA
// simulation.
#[cfg(not(feature = "re-tcl"))]
sherlock!(word_ending_n, r"\b\w+n\b", 8366);
// This is a real bad one for Rust's engine. This particular expression
// fills the state cache quite frequently, which results in a lot of churn.
// This can be made to go roughly the speed of PCRE by increasing the DFA cache
// size.
//
// Its only salvation is that the DFA realizes it's executing slowly, gives up
// quickly and falls back to the NFA algorithm.
//
// RE2 seems to do a worse job at this than Rust. So much so that it's slow
// enough to be annoying, so we disable it.
#[cfg(not(feature = "re-re2"))]
sherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142);
// This defeats any prefix optimizations but triggers the reverse suffix
// optimization.
sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);
// Similar to ing_suffix, but a little more complex by limiting the length
// of the word and making sure it's surrounded by whitespace. The trailing
// `\s` defeats the reverse suffix optimization.
//
// Onig does surprisingly well on this benchmark and yet does quite poorly on
// the ing_suffix benchmark. That one has me stumped.
sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);