Skip to content

Commit

Permalink
Rethink regexp engine's backref-related compilation state.
Browse files Browse the repository at this point in the history
I had committer's remorse almost immediately after pushing cb76fbd,
upon finding that removing capturing subexpressions' subREs from the
data structure broke my proposed patch for REG_NOSUB optimization.
Revert that data structure change.  Instead, address the concern
about not changing capturing subREs' endpoints by not changing the
endpoints.  We don't need to, because the point of that bit was just
to ensure that the atom has endpoints distinct from the outer state
pair that we're stringing the branch between.  We already made
suitable states in the parenthesized-subexpression case, so the
additional ones were just useless overhead.  This seems more
understandable than Spencer's original coding, and it ought to be
a shade faster too by saving a few state creations and arc changes.
(I actually see a couple percent improvement on Jacobson's web
corpus, though that's barely above the noise floor so I wouldn't
put much stock in that result.)

Also, fix the logic added by ea1268f to ensure that the subRE
recorded in v->subs[subno] is exactly the one with capno == subno.
Spencer's original coding recorded the child subRE of the capture
node, which is okay so far as having the right endpoint states is
concerned, but as of cb76fbd the capturing subRE itself always
has those endpoints too.  I think the inconsistency is confusing
for the REG_NOSUB optimization.

As before, backpatch to v14.

Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
  • Loading branch information
tglsfdc committed Aug 8, 2021
1 parent 75a2d13 commit 00116de
Showing 1 changed file with 47 additions and 37 deletions.
84 changes: 47 additions & 37 deletions src/backend/regex/regcomp.c
Expand Up @@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
static int casecmp(const chr *, const chr *, size_t);


/* info we need during compilation about a known capturing subexpression */
struct subinfo
{
struct state *left; /* left end of its sub-NFA */
struct state *right; /* right end of its sub-NFA */
};

/* internal variables, bundled for easy passing around */
struct vars
{
Expand All @@ -252,10 +245,10 @@ struct vars
int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see regc_lex.c) */
int nsubexp; /* number of known capturing subexpressions */
struct subinfo *subs; /* info about known capturing subexpressions */
size_t nsubs; /* allocated length of subs[] vector */
struct subinfo sub10[10]; /* initial vector, enough for most */
int nsubexp; /* subexpression count */
struct subre **subs; /* subRE pointer vector */
size_t nsubs; /* length of vector */
struct subre *sub10[10]; /* initial vector, enough for most */
struct nfa *nfa; /* the NFA */
struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */
Expand Down Expand Up @@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
v->subs = v->sub10;
v->nsubs = 10;
for (j = 0; j < v->nsubs; j++)
v->subs[j].left = v->subs[j].right = NULL;
v->subs[j] = NULL;
v->nfa = NULL;
v->cm = NULL;
v->nlcolor = COLORLESS;
Expand Down Expand Up @@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
}

/*
* moresubs - enlarge capturing-subexpressions vector
* moresubs - enlarge subRE vector
*/
static void
moresubs(struct vars *v,
int wanted) /* want enough room for this one */
{
struct subinfo *p;
struct subre **p;
size_t n;

assert(wanted > 0 && (size_t) wanted >= v->nsubs);
n = (size_t) wanted * 3 / 2 + 1;

if (v->subs == v->sub10)
{
p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
p = (struct subre **) MALLOC(n * sizeof(struct subre *));
if (p != NULL)
memcpy(VS(p), VS(v->subs),
v->nsubs * sizeof(struct subinfo));
v->nsubs * sizeof(struct subre *));
}
else
p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
if (p == NULL)
{
ERR(REG_ESPACE);
return;
}
v->subs = p;
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
p->left = p->right = NULL;
*p = NULL;
assert(v->nsubs == n);
assert((size_t) wanted < v->nsubs);
}
Expand Down Expand Up @@ -988,6 +981,7 @@ parseqatom(struct vars *v,
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERRN();
/* We may not need these arcs, but keep things connected for now */
EMPTYARC(lp, s);
EMPTYARC(s2, rp);
NOERRN();
Expand All @@ -997,10 +991,6 @@ parseqatom(struct vars *v,
NOERRN();
if (cap)
{
/* save the sub-NFA's endpoints for future backrefs to use */
assert(v->subs[subno].left == NULL);
v->subs[subno].left = s;
v->subs[subno].right = s2;
if (atom->capno == 0)
{
/* normal case: just mark the atom as capturing */
Expand All @@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
t->child = atom;
atom = t;
}
assert(v->subs[subno] == NULL);
v->subs[subno] = atom;
}
/* postpone everything else pending possible {0} */
break;
case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG);
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
NOERRN();
assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp);
Expand Down Expand Up @@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
if (atom != NULL)
freesubre(v, atom);
if (atomtype == '(')
v->subs[subno].left = v->subs[subno].right = NULL;
v->subs[subno] = NULL;
delsub(v->nfa, lp, rp);
EMPTYARC(lp, rp);
return top;
Expand Down Expand Up @@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
NOERRN();
}

/*
* For what follows, we need the atom to have its own begin/end states
* that are distinct from lp/rp, so that we can wrap iteration structure
* around it. The parenthesized-atom case above already made suitable
* states (and we don't want to modify a capturing subre, since it's
* already recorded in v->subs[]). Otherwise, we need more states.
*/
if (atom->begin == lp || atom->end == rp)
{
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
atom->begin = s;
atom->end = s2;
}
else
{
/* The atom's OK, but we must temporarily disconnect it from lp/rp */
/* (this removes the EMPTY arcs we made above) */
delsub(v->nfa, lp, atom->begin);
delsub(v->nfa, atom->end, rp);
}

/*----------
* Prepare a general-purpose state skeleton.
*
* In the no-backrefs case, we want this:
*
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
* [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
* where prefix is some repetitions of atom, and "rest" is the remainder
* of the branch. In the general case we need:
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
* where the iterator wraps around the atom.
*
* We make the s state here for both cases; s2 is made below if needed
*----------
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
NOERRN();
atom->begin = s;
atom->end = s2;
s = newstate(v->nfa); /* set up starting state */
NOERRN();
EMPTYARC(lp, s);
Expand Down Expand Up @@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
{
assert(atom->begin->nouts == 1); /* just the EMPTY */
delsub(v->nfa, atom->begin, atom->end);
assert(v->subs[subno].left != NULL);
assert(v->subs[subno] != NULL);

/*
* And here's why the recursion got postponed: it must wait until the
* skeleton is filled in, because it may hit a backref that wants to
* copy the filled-in skeleton.
*/
dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
atom->begin, atom->end);
NOERRN();

Expand Down

0 comments on commit 00116de

Please sign in to comment.