-
Notifications
You must be signed in to change notification settings - Fork 74
/
matcher.rs
286 lines (255 loc) · 10.4 KB
/
matcher.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
use anyhow::Result;
use std::sync::Mutex;
use tracing::error;
use crate::blob::Blob;
use crate::blob_id_set::BlobIdSet;
use crate::location::OffsetSpan;
use crate::matcher_stats::MatcherStats;
use crate::provenance::Provenance;
use crate::rules::Rule;
use crate::rules_database::RulesDatabase;
// -------------------------------------------------------------------------------------------------
// RawMatch
// -------------------------------------------------------------------------------------------------
/// A raw match, as recorded by a callback to Hyperscan.
///
/// When matching with Hyperscan, we simply collect all matches into a preallocated `Vec`,
/// and then go through them all after scanning is complete.
/// This type represents a single raw match from Hyperscan.
///
/// Equality and `Debug` are derived; the unit test in this file compares the collected raw
/// matches against expected values.
#[derive(PartialEq, Eq, Debug)]
struct RawMatch {
/// The id reported by the Hyperscan callback; `scan_blob` uses it as an index into the rules
/// and the anchored regexes of the rules database
rule_id: u32,
/// The start offset reported by the callback, clamped to the input length by `scan_bytes_raw`
start_idx: u64,
/// The end offset reported by the callback; `scan_blob` uses it as the exclusive end of the
/// slice handed to the second-stage regex
end_idx: u64,
}
// -------------------------------------------------------------------------------------------------
// BlobMatch
// -------------------------------------------------------------------------------------------------
/// A `BlobMatch` is the result type from `Matcher::scan_blob`.
///
/// It is mostly made up of references and small data.
/// For a representation that is more friendly for human consumption, see `Match`.
///
/// The `'r` lifetime is that of the borrowed `Rule`; the `'b` lifetime is that of the borrowed
/// `Blob`, the matched bytes, and the regex captures.
pub struct BlobMatch<'r, 'b> {
/// The rule that was matched
pub rule: &'r Rule,
/// The blob that was matched
pub blob: &'b Blob,
/// The matching input in `blob.bytes`
pub matching_input: &'b [u8],
/// The location of the matching input in `blob.bytes`
pub matching_input_offset_span: OffsetSpan,
/// The capture groups from the second-stage regex match.
///
/// NOTE(review): offsets inside the captures are relative to the slice of `blob.bytes` that
/// the regex was run on in `scan_blob`, not necessarily to the start of the blob.
pub captures: regex::bytes::Captures<'b>,
}
// -------------------------------------------------------------------------------------------------
// Matcher
// -------------------------------------------------------------------------------------------------
/// A `Matcher` is able to scan inputs for matches from rules in a `RulesDatabase`.
///
/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
///
/// The `'a` lifetime covers everything the matcher borrows: the rules database, the optional
/// global statistics, and the set of seen blobs.
pub struct Matcher<'a> {
/// A scratch buffer for Hyperscan, allocated from the rules database in `Matcher::new`
hs_scratch: hyperscan::Scratch,
/// A scratch vector for raw matches from Hyperscan, to minimize allocation;
/// it is cleared at the start of every raw scan
raw_matches_scratch: Vec<RawMatch>,
/// The rules database used for matching
rules_db: &'a RulesDatabase,
/// Local statistics for this `Matcher`
local_stats: MatcherStats,
/// Global statistics, updated with the local statistics when this `Matcher` is dropped
global_stats: Option<&'a Mutex<MatcherStats>>,
/// The set of blobs that have been seen; `scan_blob` returns no matches for a blob when
/// inserting its id into this set returns `false`
seen_blobs: &'a BlobIdSet,
}
/// This `Drop` implementation updates the `global_stats` with the local stats.
///
/// Nothing happens when no global statistics were supplied to the `Matcher`.
impl<'a> Drop for Matcher<'a> {
    fn drop(&mut self) {
        if let Some(global_stats) = self.global_stats {
            // Never panic here: a poisoned mutex only tells us that some other thread panicked
            // while holding the lock. Panicking inside `drop` while this thread is itself
            // unwinding would abort the whole process, so recover the guard and still merge
            // the local statistics.
            let mut global_stats = global_stats
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            global_stats.update(&self.local_stats);
        }
    }
}
impl<'a> Matcher<'a> {
    /// Initial capacity of the scratch vector that collects raw Hyperscan matches.
    const RAW_MATCHES_INITIAL_CAPACITY: usize = 16384;

    /// Create a new `Matcher` from the given `RulesDatabase`.
    ///
    /// `seen_blobs` is consulted by `scan_blob`: a blob whose id cannot be newly inserted into
    /// the set is not scanned.
    ///
    /// If `global_stats` is provided, it will be updated with the local stats from this `Matcher`
    /// when it is dropped.
    ///
    /// # Errors
    ///
    /// Returns an error if the Hyperscan scratch space cannot be allocated for the database.
    pub fn new(
        rules_db: &'a RulesDatabase,
        seen_blobs: &'a BlobIdSet,
        global_stats: Option<&'a Mutex<MatcherStats>>,
    ) -> Result<Self> {
        let hs_scratch = rules_db.hsdb.alloc_scratch()?;
        Ok(Matcher {
            hs_scratch,
            raw_matches_scratch: Vec::with_capacity(Self::RAW_MATCHES_INITIAL_CAPACITY),
            rules_db,
            local_stats: MatcherStats::default(),
            global_stats,
            seen_blobs,
        })
    }

    /// Scan `input` with Hyperscan, collecting every reported match into `raw_matches_scratch`.
    ///
    /// Matches left over from a previous scan are discarded first.
    ///
    /// # Errors
    ///
    /// Returns an error if the Hyperscan scan itself fails.
    #[inline]
    fn scan_bytes_raw(&mut self, input: &[u8]) -> Result<()> {
        self.raw_matches_scratch.clear();
        let input_len = u64::try_from(input.len()).expect("input length should fit in a u64");

        // Borrow the scratch vector on its own so the callback can push into it while the
        // database and the Hyperscan scratch space are borrowed immutably for the scan.
        let raw_matches = &mut self.raw_matches_scratch;
        self.rules_db.hsdb.scan(
            input,
            &self.hs_scratch,
            |id: u32, from: u64, to: u64, _flags: u32| {
                // NOTE: `from` is only going to be meaningful here if we start compiling rules
                // with the HS_SOM_LEFTMOST flag. But it doesn't seem to hurt to use the 0-value
                // provided when that flag is not used.
                raw_matches.push(RawMatch {
                    rule_id: id,
                    start_idx: from.min(input_len),
                    end_idx: to,
                });
                hyperscan::Matching::Continue
            },
        )?;
        Ok(())
    }

    /// Scan a blob.
    ///
    /// The blob is first recorded in the local statistics and in the set of seen blobs; a blob
    /// that was already seen yields no matches. Otherwise the blob is scanned with Hyperscan,
    /// and each raw match is re-matched with the rule's anchored regex to obtain capture groups
    /// and a precise location. A match is dropped when the previously emitted match is for the
    /// same rule and its span fully contains the new one.
    ///
    /// `provenance` is used only for diagnostic purposes if something goes wrong.
    ///
    /// # Errors
    ///
    /// Returns an error if the Hyperscan scan fails.
    ///
    /// # Panics
    ///
    /// Panics if a raw match refers to a rule id or byte offsets outside of the rules database
    /// or the blob, respectively.
    pub fn scan_blob<'b>(
        &mut self,
        blob: &'b Blob,
        provenance: &Provenance,
    ) -> Result<Vec<BlobMatch<'a, 'b>>> {
        // --------------------
        // Update local stats
        // --------------------
        self.local_stats.blobs_seen += 1;
        let nbytes = blob.bytes.len() as u64;
        self.local_stats.bytes_seen += nbytes;

        if !self.seen_blobs.insert(blob.id) {
            // This blob was already seen; skip it.
            return Ok(Vec::new());
        }

        self.local_stats.blobs_scanned += 1;
        self.local_stats.bytes_scanned += nbytes;

        // --------------------
        // Actually scan the content
        // --------------------
        self.scan_bytes_raw(&blob.bytes)?;
        if self.raw_matches_scratch.is_empty() {
            // No matches! We can exit early and save work.
            return Ok(Vec::new());
        }

        // --------------------
        // Perform second-stage regex matching to get groups and precise start locations
        //
        // Also deduplicate overlapping matches with the same rule
        // --------------------

        // Two raw matches with equal sort keys are identical, so an unstable sort gives the
        // same order as a stable one without the extra allocation.
        self.raw_matches_scratch.sort_unstable_by_key(|m| {
            debug_assert!(m.start_idx <= m.end_idx);
            (m.rule_id, m.end_idx, m.end_idx - m.start_idx)
        });

        let rules = &self.rules_db.rules.rules;
        let anchored_regexes = &self.rules_db.anchored_regexes;

        // (rule id, offset span) from most recently emitted match
        let mut previous: Option<(usize, OffsetSpan)> = None;

        // note that we walk _backwards_ over the raw matches: this allows us to detect and
        // suppress overlapping matches in a single pass
        let matches = self.raw_matches_scratch.iter().rev().filter_map(
            |&RawMatch { rule_id, start_idx, end_idx }| {
                // The offsets index into `blob.bytes`, whose length is a `usize`.
                let rule_id = rule_id as usize;
                let start_idx = start_idx as usize;
                let end_idx = end_idx as usize;

                let rule = &rules[rule_id];
                let re = &anchored_regexes[rule_id];

                // second-stage regex match
                let captures = match re.captures(&blob.bytes[start_idx..end_idx]) {
                    Some(cs) => cs,
                    None => {
                        // Show up to 400 bytes leading up to the end of the raw match.
                        let cxt = String::from_utf8_lossy(
                            &blob.bytes[end_idx.saturating_sub(400)..end_idx],
                        );
                        error!("\
                            Regex failed to match where hyperscan did; something is probably odd about the rule:\n\
                            Blob: {}\n\
                            Provenance: {:?}\n\
                            Offsets: [{}..{}]\n\
                            Rule id: {}\n\
                            Rule name: {:?}:\n\
                            Regex: {:?}:\n\
                            Snippet: {:?}",
                            &blob.id,
                            provenance,
                            start_idx,
                            end_idx,
                            rule_id,
                            rule.name,
                            re,
                            cxt,
                        );
                        return None;
                    }
                };

                let matching_input = captures
                    .get(0)
                    .expect("regex captures should have group for entire match");

                // The regex ran on `blob.bytes[start_idx..end_idx]`, so its offsets are relative
                // to `start_idx`; shift them to get a location within the whole blob. This is a
                // no-op for as long as Hyperscan reports a start offset of 0.
                let local_range = matching_input.range();
                let matching_input_offset_span = OffsetSpan::from_range(
                    (start_idx + local_range.start)..(start_idx + local_range.end),
                );

                // deduplicate overlaps
                let suppress = matches!(
                    &previous,
                    Some((prev_rule_id, prev_loc))
                        if *prev_rule_id == rule_id
                            && prev_loc.fully_contains(&matching_input_offset_span)
                );
                if suppress {
                    return None;
                }

                // Not a duplicate! Turn the RawMatch into a BlobMatch
                let m = BlobMatch {
                    rule,
                    blob,
                    matching_input: matching_input.as_bytes(),
                    matching_input_offset_span: matching_input_offset_span.clone(),
                    captures,
                };
                previous = Some((rule_id, matching_input_offset_span));
                Some(m)
            },
        );
        Ok(matches.collect())
    }
}
// -------------------------------------------------------------------------------------------------
// test
// -------------------------------------------------------------------------------------------------
#[cfg(test)]
mod test {
    use super::*;
    use crate::rules::Rules;
    use pretty_assertions::assert_eq;

    /// A single literal rule should produce exactly one raw match, ending right after the
    /// occurrence of the literal in the input.
    #[test]
    pub fn test_simple() -> Result<()> {
        let only_rule = Rule {
            name: "test".to_string(),
            pattern: "test".to_string(),
            examples: vec![],
            negative_examples: vec![],
            references: vec![],
        };
        let rules_db = RulesDatabase::from_rules(Rules {
            rules: vec![only_rule],
        })?;

        let seen_blobs = BlobIdSet::new();
        let mut matcher = Matcher::new(&rules_db, &seen_blobs, None)?;

        let haystack = "some test data for hyperscan";
        matcher.scan_bytes_raw(haystack.as_bytes())?;

        let expected = vec![RawMatch {
            rule_id: 0,
            start_idx: 0,
            end_idx: 9,
        }];
        assert_eq!(matcher.raw_matches_scratch, expected);

        Ok(())
    }
}