Skip to content

Commit

Permalink
Add known upper limit to capture search.
Browse files Browse the repository at this point in the history
The DFA will report the end location of a match, so we should pass that
along to capture detection. In theory, the DFA and the NFA report the
same match locations, so this upper bound shouldn't be necessary---the
NFA should quit once it finds the right match. It turns out, though, that
bounding the text has two important ramifications:

1. It will enable the backtracking engine to be used more often. In
particular, the backtracking engine can only be used on small inputs and
this change decreases the size of the input by only considering the
match.
2. The backtracking engine must start every search by zeroing memory
that is proportional to the size of the input. If the input is smaller,
then this runs more quickly.

We are also careful to bound the match to one additional "character"
past the end of the match, so that lookahead operators work correctly.
  • Loading branch information
BurntSushi committed Apr 27, 2016
1 parent 4471212 commit 7084f14
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 13 deletions.
25 changes: 12 additions & 13 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

use std::cell::RefCell;
use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use thread_local::CachedThreadLocal;
Expand All @@ -27,6 +28,7 @@ use re_bytes;
use re_trait::{RegularExpression, Slot};
use re_unicode;
use set;
use utf8::next_utf8;

/// Exec manages the execution of a regular expression.
///
Expand Down Expand Up @@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
fn slots_len(&self) -> usize { self.0.slots_len() }

// Returns the index just past the UTF-8 sequence whose leading byte is at
// byte offset `i` of `text` (presumably used to step the search forward after
// an empty match -- confirm against the caller).
//
// NOTE(review): this hunk is rendered without +/- markers. Going by the hunk
// header (17 lines before, 7 after), the lines from `let b = ...` through
// `i + inc` are the pre-commit body that this commit removes, and the final
// `next_utf8(...)` call is the single line that replaces them.
fn next_after_empty(&self, text: &str, i: usize) -> usize {
// Pre-commit body: indexes the byte directly, so `i` must be in bounds.
let b = text.as_bytes()[i];
// Width of the sequence, judged from the leading byte alone.
let inc = if b <= 0x7F {
1
} else if b <= 0b110_11111 {
2
} else if b <= 0b1110_1111 {
3
} else {
4
};
i + inc
// Post-commit body: the same computation, delegated to the shared helper.
next_utf8(text.as_bytes(), i)
}

#[inline(always)] // reduces constant overhead
Expand Down Expand Up @@ -439,9 +431,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, _)) => {
dfa::Result::Match((s, e)) => {
// We need the +1 here to account for lookahead
// operators.
let e = if self.ro.nfa.uses_bytes() {
cmp::min(e + 1, text.len())
} else {
cmp::min(next_utf8(text, e), text.len())
};
self.captures_nfa(
MatchNfaType::Auto, slots, text, s)
MatchNfaType::Auto, slots, &text[..e], s)
}
dfa::Result::NoMatch => None,
dfa::Result::Quit => {
Expand Down
19 changes: 19 additions & 0 deletions src/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
const TAG_THREE: u8 = 0b1110_0000;
const TAG_FOUR: u8 = 0b1111_0000;

/// Returns the index immediately following the UTF-8 sequence that begins at
/// byte offset `i` in `text`.
///
/// The width of the sequence is decided from the byte at `i` alone; the
/// bytes that follow are never inspected or validated:
///
/// * `0x00..=0x7F` advances by 1,
/// * `0x80..=0xDF` advances by 2,
/// * `0xE0..=0xEF` advances by 3,
/// * anything larger advances by 4.
///
/// When `i` is not a valid index into `text`, `i + 1` is returned. The result
/// may therefore exceed `text.len()`, and callers that need an in-bounds
/// offset must clamp it themselves.
pub fn next_utf8(text: &[u8], i: usize) -> usize {
    let width = match text.get(i) {
        // Nothing to decode at `i`: step forward by a single position.
        None => 1,
        Some(&lead) if lead <= 0x7F => 1,
        Some(&lead) if lead <= 0b110_11111 => 2,
        Some(&lead) if lead <= 0b1110_1111 => 3,
        Some(_) => 4,
    };
    i + width
}

/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
Expand Down

0 comments on commit 7084f14

Please sign in to comment.