diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 6329d9a017fea..e6b5793fddafd 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3676,6 +3676,12 @@ select * from test2 where t ~ ' z foo'; z foo bar (1 row) +select * from test2 where t ~ 'qua(?!foo)'; + t +------- + quark +(1 row) + drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -3856,6 +3862,12 @@ select * from test2 where t ~ ' z foo'; z foo bar (1 row) +select * from test2 where t ~ 'qua(?!foo)'; + t +------- + quark +(1 row) + -- Check similarity threshold (bug #14202) CREATE TEMP TABLE restaurants (city text); INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000); diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 6cddda2115bdb..eccd921d82c84 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -81,7 +81,9 @@ select * from test2 where t ~ 'z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; +select * from test2 where t ~ 'qua(?!foo)'; drop index test2_idx_gin; + create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -116,6 +118,7 @@ select * from test2 where t ~ 'z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; +select * from test2 where t ~ 'qua(?!foo)'; -- Check similarity threshold (bug #14202) diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c index 93da82286f3d5..43fbe98f7702a 100644 --- a/src/backend/regex/regexport.c +++ b/src/backend/regex/regexport.c @@ -6,8 +6,8 @@ * In this implementation, the NFA defines a necessary but not sufficient * condition for a string to match the regex: that is, there can be strings * that match the NFA but don't match the full regex, but not vice versa. - * Thus, for example, it is okay for the functions below to ignore lookaround - * constraints, which merely constrain the string some more. + * Thus, for example, it is okay for the functions below to treat lookaround + * constraints as no-ops, since they merely constrain the string some more. * * Notice that these functions return info into caller-provided arrays * rather than doing their own malloc's. This simplifies the APIs by @@ -28,6 +28,8 @@ #include "regex/regexport.h" +#include "miscadmin.h" + static void scancolormap(struct colormap * cm, int co, union tree * t, int level, chr partial, pg_wchar **chars, int *chars_len); @@ -76,29 +78,78 @@ pg_reg_getfinalstate(const regex_t *regex) } /* - * Get number of outgoing NFA arcs of state number "st". + * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON + * arcs from the caller, treating any LACON as being automatically satisfied. + * Since the output representation does not support arcs that consume no + * character when traversed, we have to recursively traverse LACON arcs here, + * and report whatever normal arcs are reachable by traversing LACON arcs. + * Note that this wouldn't work if it were possible to reach the final state + * via LACON traversal, but the regex library never builds NFAs that have + * LACON arcs leading directly to the final state. (This is because the + * regex executor is designed to consume one character beyond the nominal + * match end --- possibly an EOS indicator --- so there is always a set of + * ordinary arcs leading to the final state.) * - * Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs(). + * traverse_lacons is a recursive subroutine used by both exported functions + * to count and then emit the reachable regular arcs. *arcs_count is + * incremented by the number of reachable arcs, and as many as will fit in + * arcs_len (possibly 0) are emitted into arcs[]. + */ +static void +traverse_lacons(struct cnfa * cnfa, int st, + int *arcs_count, + regex_arc_t *arcs, int arcs_len) +{ + struct carc *ca; + + /* + * Since this function recurses, it could theoretically be driven to stack + * overflow. In practice, this is mostly useful to backstop against a + * failure of the regex compiler to remove a loop of LACON arcs. + */ + check_stack_depth(); + + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + { + /* Ordinary arc, so count and possibly emit it */ + int ndx = (*arcs_count)++; + + if (ndx < arcs_len) + { + arcs[ndx].co = ca->co; + arcs[ndx].to = ca->to; + } + } + else + { + /* LACON arc --- assume it's satisfied and recurse... */ + /* ... but first, assert it doesn't lead directly to post state */ + Assert(ca->to != cnfa->post); + + traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); + } + } +} + +/* + * Get number of outgoing NFA arcs of state number "st". */ int pg_reg_getnumoutarcs(const regex_t *regex, int st) { struct cnfa *cnfa; - struct carc *ca; - int count; + int arcs_count; assert(regex != NULL && regex->re_magic == REMAGIC); cnfa = &((struct guts *) regex->re_guts)->search; if (st < 0 || st >= cnfa->nstates) return 0; - count = 0; - for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) - { - if (ca->co < cnfa->ncolors) - count++; - } - return count; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, NULL, 0); + return arcs_count; } /* @@ -111,24 +162,15 @@ pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len) { struct cnfa *cnfa; - struct carc *ca; + int arcs_count; assert(regex != NULL && regex->re_magic == REMAGIC); cnfa = &((struct guts *) regex->re_guts)->search; if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) return; - for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) - { - if (ca->co < cnfa->ncolors) - { - arcs->co = ca->co; - arcs->to = ca->to; - arcs++; - if (--arcs_len == 0) - break; - } - } + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); } /*