Skip to content

Commit

Permalink
Merge pull request #217 from rust-lang-nursery/fix-capture-perf
Browse files Browse the repository at this point in the history
Add known upper limit to capture search.
  • Loading branch information
BurntSushi committed Apr 27, 2016
2 parents 4471212 + 49e8df5 commit 3f408e5
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 15 deletions.
36 changes: 21 additions & 15 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

use std::cell::RefCell;
use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use thread_local::CachedThreadLocal;
Expand All @@ -27,6 +28,7 @@ use re_bytes;
use re_trait::{RegularExpression, Slot};
use re_unicode;
use set;
use utf8::next_utf8;

/// Exec manages the execution of a regular expression.
///
Expand Down Expand Up @@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
fn slots_len(&self) -> usize { self.0.slots_len() }

fn next_after_empty(&self, text: &str, i: usize) -> usize {
let b = text.as_bytes()[i];
let inc = if b <= 0x7F {
1
} else if b <= 0b110_11111 {
2
} else if b <= 0b1110_1111 {
3
} else {
4
};
i + inc
next_utf8(text.as_bytes(), i)
}

#[inline(always)] // reduces constant overhead
Expand Down Expand Up @@ -433,15 +425,29 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
}
match self.ro.match_type {
MatchType::Literal(ty) => {
self.exec_literals(ty, text, start).and_then(|(s, _)| {
self.captures_nfa(MatchNfaType::Auto, slots, text, s)
self.exec_literals(ty, text, start).and_then(|(s, e)| {
// We need the +1 here to account for lookahead
// operators.
let e = if self.ro.nfa.uses_bytes() {
cmp::min(e + 1, text.len())
} else {
cmp::min(next_utf8(text, e), text.len())
};
self.captures_nfa(MatchNfaType::Auto, slots, &text[..e], s)
})
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, _)) => {
dfa::Result::Match((s, e)) => {
// We need the +1 here to account for lookahead
// operators.
let e = if self.ro.nfa.uses_bytes() {
cmp::min(e + 1, text.len())
} else {
cmp::min(next_utf8(text, e), text.len())
};
self.captures_nfa(
MatchNfaType::Auto, slots, text, s)
MatchNfaType::Auto, slots, &text[..e], s)
}
dfa::Result::NoMatch => None,
dfa::Result::Quit => {
Expand Down
19 changes: 19 additions & 0 deletions src/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
const TAG_THREE: u8 = 0b1110_0000;
const TAG_FOUR: u8 = 0b1111_0000;

/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// Only the single byte at `text[i]` is inspected; it is treated as a lead
/// byte and the result is `i` plus the width bucket that byte falls into:
///
/// * `0x00..=0x7F` advances by 1
/// * `0x80..=0xDF` advances by 2
/// * `0xE0..=0xEF` advances by 3
/// * `0xF0..=0xFF` advances by 4
///
/// No validation is performed: the following bytes are never read, and a
/// byte in the continuation range (`0x80..=0xBF`) lands in the 2-byte bucket.
/// The returned index may therefore exceed `text.len()`.
///
/// If `i` is not a valid index into `text`, `i + 1` is returned.
pub fn next_utf8(text: &[u8], i: usize) -> usize {
    // An out-of-range position is stepped over as if it held one byte.
    let width = text.get(i).map_or(1, |&lead| {
        if lead > 0b1110_1111 {
            4
        } else if lead > 0b1101_1111 {
            3
        } else if lead > 0x7F {
            2
        } else {
            1
        }
    });
    i + width
}

/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
Expand Down

0 comments on commit 3f408e5

Please sign in to comment.