Skip to content

Commit

Permalink
Limit maximum allowed NFA and DFA size.
Browse files Browse the repository at this point in the history
Instead of failing with an out-of-memory exception or crashing with a
stack overflow, emit an error message and exit. This is a partial fix
for bug #394 "Stack overflow due to recursion in src/dfa/dead_rules.cc",
where re2c hit a stack overflow on a counted-repetition regexp with a
high upper bound.

The patch adds the following limits:
  1. the number of NFA states
  2. NFA depth (maximum length of a non-looping path from start to end)
  3. the number of DFA states
  4. total DFA size (sum total of all NFA substates in all DFA states)

There are tests for the first three limits, but not for the DFA size,
as all examples that trigger this limit take a long time to finish (a
few seconds), which almost doubles the test run time.
  • Loading branch information
skvadrik committed Jan 21, 2022
1 parent aa02264 commit a3473fd
Show file tree
Hide file tree
Showing 15 changed files with 130 additions and 66 deletions.
9 changes: 6 additions & 3 deletions lib/regcomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,13 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
RESpec re(arv, opt, msg, *preg->rmgr);

find_fixed_tags(re);

insert_default_tags(re);

nfa_t *nfa = new nfa_t(re);
size_t nfa_size, nfa_depth;
compute_size_and_depth(re.res, &nfa_size, &nfa_depth);
if (nfa_depth > MAX_NFA_DEPTH || nfa_size > MAX_NFA_STATES) return 1;

nfa_t *nfa = new nfa_t(re, nfa_size);

nfa_t *nfa0 = NULL;
if (cflags & REG_BACKWARD) {
Expand All @@ -69,7 +72,7 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
Opt opts0(globopts0, msg);
const opt_t *opt0 = opts0.snapshot();
RESpec re0(arv, opt0, msg, *preg->rmgr);
nfa0 = new nfa_t(re0);
nfa0 = new nfa_t(re0, nfa_size);
delete opt0;
}

Expand Down
12 changes: 11 additions & 1 deletion src/compile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,17 @@ static smart_ptr<DFA> ast_to_dfa(const spec_t &spec, Output &output)
insert_default_tags(re);
warn_nullable(re, cond);

nfa_t nfa(re);
size_t nfa_size, nfa_depth;
compute_size_and_depth(re.res, &nfa_size, &nfa_depth);
if (nfa_depth > MAX_NFA_DEPTH) {
error("NFA depth exceeds limits");
exit(1);
} else if (nfa_size > MAX_NFA_STATES) {
error("NFA has too many states");
exit(1);
}

nfa_t nfa(re, nfa_size);
DDUMP_NFA(opts, nfa);

dfa_t dfa(nfa, spec.def_rule, spec.eof_rule);
Expand Down
7 changes: 6 additions & 1 deletion src/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,12 @@ enum DirConf {
DCONF_SEPARATOR = 2u
};

const uint32_t NOEOF = ~0u - 1;
static const uint32_t NOEOF = ~0u - 1;

static const size_t MAX_NFA_DEPTH = 1000 * 1000;
static const size_t MAX_NFA_STATES = 1000 * 1000 * 100;
static const size_t MAX_DFA_STATES = 1000 * 100;
static const size_t MAX_DFA_SIZE = 1000 * 1000 * 50;

} // namespace re2c

Expand Down
17 changes: 14 additions & 3 deletions src/dfa/determinization.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ void determinization(ctx_t &ctx)
tagged_epsilon_closure(ctx);
find_state(ctx);

// iterate while new kernels are added: for each alphabet symbol,
// build tagged epsilon-closure of all reachable NFA states,
// then find identical or mappable DFA state or add a new one
// Iterate while new kernels are added: for each alphabet symbol, build tagged
// epsilon-closure of all reachable NFA states, then find identical or mappable DFA
// state or add a new one.
for (uint32_t i = 0; i < ctx.dc_kernels.size(); ++i) {
ctx.dc_origin = i;
clear_caches(ctx);
Expand All @@ -88,6 +88,16 @@ void determinization(ctx_t &ctx)
reach_on_symbol(ctx, c);
tagged_epsilon_closure(ctx);
find_state(ctx);

// Abort if DFA grows too fast (either in the number of states, or in the
// total size of all state kernels which may have many NFA substates).
if (ctx.dc_kernels.size() > MAX_DFA_STATES) {
error("DFA has too many states");
exit(1);
} else if (ctx.kernels_total > MAX_DFA_SIZE) {
error("DFA is too large");
exit(1);
}
}
}

Expand Down Expand Up @@ -252,6 +262,7 @@ determ_context_t<history_t>::determ_context_t(const opt_t *opts, Msg &msg
, dc_tagvertbl(nfa.tags.size())
, history()
, dc_kernels()
, kernels_total(0)
, dc_buffers()
, dc_hc_caches()
, dc_newvers(newver_cmp_t<history_t>(history, dc_hc_caches))
Expand Down
1 change: 1 addition & 0 deletions src/dfa/determinization.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ struct determ_context_t
tagver_table_t dc_tagvertbl;
history_t history; // prefix trie of tag histories
kernels_t dc_kernels; // TDFA states under construction
size_t kernels_total; // sum total of all kernel sizes
kernel_buffers_t dc_buffers;
hc_caches_t dc_hc_caches; // per-tag cache of history comparisons
newvers_t dc_newvers; // map of triples (tag, version, history) to new version
Expand Down
1 change: 1 addition & 0 deletions src/dfa/find_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ bool do_find_state(ctx_t &ctx)
// otherwise add new kernel
kernel_t *kcopy = make_kernel_copy<stadfa>(k, ctx.dc_allocator);
ctx.dc_target = kernels.push(hash, kcopy);
ctx.kernels_total += k->size;
return true;
}

Expand Down
101 changes: 47 additions & 54 deletions src/nfa/estimate_size.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,19 @@ namespace re2c {
namespace {

struct StackItem {
const RE *re; // current sub-RE
uint32_t size; // size of the sub-RE (only for alternative and concatenation)
uint8_t succ; // index of the next sucessor to be visited
const RE *re; // current sub-RE
uint32_t size; // RE size (only for alternative and concatenation)
uint32_t depth; // RE depth (only for alternative and concatenation)
uint8_t succ; // index of the next successor to be visited
};

static uint32_t estimate_re_size(const RE *re0, std::vector<StackItem> &stack)
{
// the estimated size of the last sub-RE visited by DFS
uint32_t size = 0;
static void compute_re_size_and_depth(
const RE *re0, std::vector<StackItem> &stack, size_t *psize, size_t *pdepth) {

const StackItem i0 = {re0, 0, 0};
// the estimated size and depth of the last sub-RE visited by DFS
uint32_t size = 0, depth = 0;

const StackItem i0 = {re0, 0, 0, 0};
stack.push_back(i0);

while (!stack.empty()) {
Expand All @@ -30,91 +32,82 @@ static uint32_t estimate_re_size(const RE *re0, std::vector<StackItem> &stack)

const RE *re = i.re;
if (re->type == RE::NIL) {
size = 0;
}
else if (re->type == RE::SYM || re->type == RE::TAG) {
size = 1;
}
else if (re->type == RE::ALT) {
size = depth = 0;
} else if (re->type == RE::SYM || re->type == RE::TAG) {
size = depth = 1;
} else if (re->type == RE::ALT) {
if (i.succ == 0) {
// recurse into the left sub-RE
StackItem k = {re, 0, 1};
StackItem k = {re, 0, 0, 1};
stack.push_back(k);
StackItem j = {re->alt.re1, 0, 0};
StackItem j = {re->alt.re1, 0, 0, 0};
stack.push_back(j);
}
else if (i.succ == 1) {
} else if (i.succ == 1) {
// recurse into the right sub-RE
StackItem k = {re, size, 2};
StackItem k = {re, size, depth, 2};
stack.push_back(k);
StackItem j = {re->alt.re2, 0, 0};
StackItem j = {re->alt.re2, 0, 0, 0};
stack.push_back(j);
}
else {
} else {
// both sub-RE visited, recursive return
size = i.size // left sub-RE (saved on stack)
+ size // right sub-RE (just visited by DFS)
+ 1; // additional state for alternative
// (left one is on stack, right one was just visited by DFS)
size = 1 + i.size + size;
depth = 1 + std::max(i.depth, depth);
}
}
else if (re->type == RE::CAT) {
} else if (re->type == RE::CAT) {
if (i.succ == 0) {
// recurse into the left sub-RE
StackItem k = {re, 0, 1};
StackItem k = {re, 0, 0, 1};
stack.push_back(k);
StackItem j = {re->cat.re1, 0, 0};
StackItem j = {re->cat.re1, 0, 0, 0};
stack.push_back(j);
}
else if (i.succ == 1) {
} else if (i.succ == 1) {
// recurse into the right sub-RE
StackItem k = {re, size, 2};
StackItem k = {re, size, depth, 2};
stack.push_back(k);
StackItem j = {re->cat.re2, 0, 0};
StackItem j = {re->cat.re2, 0, 0, 0};
stack.push_back(j);
}
else {
} else {
// both sub-RE visited, recursive return
size = i.size // left sub-RE (saved on stack)
+ size; // right sub-RE (just visited by DFS)
// (left one is on stack, right one was just visited by DFS)
size = i.size + size;
depth = i.depth + depth;
}
}
else if (re->type == RE::ITER) {
} else if (re->type == RE::ITER) {
if (i.succ == 0) {
// recurse into the sub-RE
StackItem k = {re, 0, 1};
StackItem k = {re, 0, 0, 1};
stack.push_back(k);
StackItem j = {re->iter.re, 0, 0};
StackItem j = {re->iter.re, 0, 0, 0};
stack.push_back(j);
}
else {
} else {
// sub-RE visited, recursive return
// formula is the same for size and depth (it reflects NFA construction)
const uint32_t min = re->iter.min, max = re->iter.max;
size = max == AST::MANY
? size * min + 1
: size * max + (max - min);
size = max == AST::MANY ? size * min + 1 : size * max + (max - min);
depth = max == AST::MANY ? depth * min + 1 : depth * max + (max - min);
}
}
}

DASSERT(stack.empty());
return size;

*psize = *psize + size + 1;
*pdepth = std::max(*pdepth, static_cast<size_t>(depth));
}

} // anonymous namespace

size_t estimate_size(const std::vector<RE*> &res)
{
void compute_size_and_depth(const std::vector<RE*> &res, size_t *psize, size_t *pdepth) {
std::vector<StackItem> stack;

const size_t nre = res.size();
DASSERT(nre > 0);
size_t size = nre - 1;
*psize = nre - 1;
*pdepth = 0;

for (size_t i = 0; i < nre; ++i) {
size += estimate_re_size(res[i], stack) + 1;
compute_re_size_and_depth(res[i], stack, psize, pdepth);
}

return size;
}

} // namespace re2c
4 changes: 2 additions & 2 deletions src/nfa/nfa.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,15 @@ struct nfa_t
nfa_state_t *root;
uint32_t ncores;

explicit nfa_t(const RESpec &spec);
nfa_t(const RESpec &spec, size_t max_size);
~nfa_t();

FORBID_COPY(nfa_t);
};

static const uint32_t NONCORE = ~0u;

size_t estimate_size(const std::vector<RE*> &res);
void compute_size_and_depth(const std::vector<RE*> &res, size_t *psize, size_t *pdepth);

} // namespace re2c

Expand Down
4 changes: 2 additions & 2 deletions src/nfa/re_to_nfa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ static uint32_t stats(nfa_state_t *root)
return ncores;
}

nfa_t::nfa_t(const RESpec &spec)
: max_size(estimate_size(spec.res))
nfa_t::nfa_t(const RESpec &spec, size_t max_size)
: max_size(max_size)
, size(0)
, states(new nfa_state_t[max_size])
, charset(spec.charset)
Expand Down
1 change: 1 addition & 0 deletions test/messages/large_regexp_01.c
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
re2c: error: NFA depth exceeds limits
6 changes: 6 additions & 0 deletions test/messages/large_regexp_01.re
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// re2c $INPUT

// NFA depth exceeds limits
/*!re2c
((((([a]{10}){10}){10}){10}){10}){10} [b] {}
*/
1 change: 1 addition & 0 deletions test/messages/large_regexp_02.c
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
re2c: error: NFA has too many states
25 changes: 25 additions & 0 deletions test/messages/large_regexp_02.re
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// re2c $INPUT

// NFA has too many states
/*!re2c
x = [a]{1,10};
y = x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x|
x|x|x|x|x|x|x|x|x|x;
z = y{1,10};
w = z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z|
z|z|z|z|z|z|z|z|z|z;
u = w{100};
u {}
*/
1 change: 1 addition & 0 deletions test/messages/large_regexp_03.c
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
re2c: error: DFA has too many states
6 changes: 6 additions & 0 deletions test/messages/large_regexp_03.re
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// re2c $INPUT

// DFA has too many states
/*!re2c
((((([a]{10}){10}){10}){10}){10}){10} {}
*/

0 comments on commit a3473fd

Please sign in to comment.