Skip to content

Commit

Permalink
fix repeat of preferred empty match
Browse files Browse the repository at this point in the history
Ports fix of golang/go#46123 to RE2/J.

Original go fix:

golang/go@2a61b3c

Change description from that commit:

In Perl mode, (|a)* should match an empty string at the start of the
input. Instead it matches as many a's as possible.
Because (|a)+ is handled correctly, matching only an empty string,
this leads to the paradox that e* can match more text than e+
(for e = (|a)) and that e+ is sometimes different from ee*.

The current code treats e* and e+ as the same structure, with
different entry points. In the case of e* the preference list ends up
not quite in the right order, in part because the "before e" and
"after e" states are the same state. Splitting them apart fixes the
preference list, and that can be done by compiling e* as if it were
(e+)?.

Fixes google#136.
  • Loading branch information
sjamesr committed May 27, 2021
1 parent 2bf9ce9 commit c8ff00d
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 47 deletions.
38 changes: 29 additions & 9 deletions java/com/google/re2j/Compiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Compiler {
private static class Frag {
final int i; // an instruction address (pc).
int out; // a patch list; see explanation in Prog.java
boolean nullable; // whether the fragment can match the empty string

Frag() {
this(0, 0);
Expand All @@ -34,8 +35,13 @@ private static class Frag {
}

Frag(int i, int out) {
this(i, out, false);
}

Frag(int i, int out, boolean nullable) {
this.i = i;
this.out = out;
this.nullable = nullable;
}
}

Expand All @@ -56,7 +62,7 @@ static Prog compileRegexp(Regexp re) {
private Frag newInst(int op) {
// TODO(rsc): impose length limit.
prog.addInst(op);
return new Frag(prog.numInst() - 1);
return new Frag(prog.numInst() - 1, 0, true);
}

// Returns a no-op fragment. Sometimes unavoidable.
Expand Down Expand Up @@ -90,7 +96,7 @@ private Frag cat(Frag f1, Frag f2) {
}
// TODO(rsc): elide nop
prog.patch(f1.out, f2.i);
return new Frag(f1.i, f2.out);
return new Frag(f1.i, f2.out, f1.nullable && f2.nullable);
}

// Given fragments for a and b, returns fragment for a|b.
Expand All @@ -107,11 +113,16 @@ private Frag alt(Frag f1, Frag f2) {
i.out = f1.i;
i.arg = f2.i;
f.out = prog.append(f1.out, f2.out);
f.nullable = f1.nullable || f2.nullable;
return f;
}

// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
private Frag quest(Frag f1, boolean nongreedy) {
// loop returns the fragment for the main loop of a plus or star.
// For plus, it can be used directly. with f1.i as the entry.
// For star, it can be used directly when f1 can't match an empty string.
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
// to get the priority match order correct.)
private Frag loop(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
Expand All @@ -121,12 +132,12 @@ private Frag quest(Frag f1, boolean nongreedy) {
i.out = f1.i;
f.out = f.i << 1 | 1;
}
f.out = prog.append(f.out, f1.out);
prog.patch(f1.out, f.i);
return f;
}

// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
private Frag star(Frag f1, boolean nongreedy) {
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
private Frag quest(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
Expand All @@ -136,13 +147,21 @@ private Frag star(Frag f1, boolean nongreedy) {
i.out = f1.i;
f.out = f.i << 1 | 1;
}
prog.patch(f1.out, f.i);
f.out = prog.append(f.out, f1.out);
return f;
}

// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
private Frag star(Frag f1, boolean nongreedy) {
if (f1.nullable) {
return quest(plus(f1, nongreedy), nongreedy);
}
return loop(f1, nongreedy);
}

// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
private Frag plus(Frag f1, boolean nongreedy) {
return new Frag(f1.i, star(f1, nongreedy).out);
return new Frag(f1.i, loop(f1, nongreedy).out, f1.nullable);
}

// op is a bitmask of EMPTY_* flags.
Expand All @@ -160,6 +179,7 @@ private Frag rune(int rune, int flags) {
// flags : parser flags
private Frag rune(int[] runes, int flags) {
Frag f = newInst(Inst.RUNE);
f.nullable = false;
Inst i = prog.getInst(f.i);
i.runes = runes;
flags &= RE2.FOLD_CASE; // only relevant flag is FoldCase
Expand Down
10 changes: 9 additions & 1 deletion javatests/com/google/re2j/FindTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ public String toString() {
35,
35,
36),
new Test("(|a)*", "aa", 3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2),
};

@Parameters
Expand Down Expand Up @@ -538,7 +539,14 @@ private void testSubmatch(String testName, Test test, int n, String[] result) {
System.err.println(testName + " " + test + " " + n + " " + k + " ");
String expect = test.submatchString(n, k / 2);
if (!expect.equals(result[k / 2])) {
fail(String.format("%s %d: expected %s got %s: %s", testName, n, expect, result, test));
fail(
String.format(
"%s %d: expected %s got %s: %s",
testName,
n,
expect,
Arrays.toString(result),
test));
}
}
}
Expand Down
19 changes: 19 additions & 0 deletions javatests/com/google/re2j/ProgTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,25 @@ public class ProgTest {
"(?:(?:^).)",
"0 fail\n" + "1* empty 4 -> 2\n" + "2 anynotnl -> 3\n" + "3 match\n"
},
{
"(?:|a)+",
"0 fail\n"
+ "1 nop -> 4\n"
+ "2 rune1 \"a\" -> 4\n"
+ "3* alt -> 1, 2\n"
+ "4 alt -> 3, 5\n"
+ "5 match\n"
},
{
"(?:|a)*",
"0 fail\n"
+ "1 nop -> 4\n"
+ "2 rune1 \"a\" -> 4\n"
+ "3 alt -> 1, 2\n"
+ "4 alt -> 3, 6\n"
+ "5* alt -> 3, 6\n"
+ "6 match\n"
},
};

private final String input;
Expand Down
1 change: 1 addition & 0 deletions javatests/com/google/re2j/SimplifyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import static org.junit.Assert.assertEquals;

import com.google.common.truth.Truth;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
Expand Down
12 changes: 4 additions & 8 deletions testdata/basic.dat
Original file line number Diff line number Diff line change
Expand Up @@ -124,24 +124,20 @@ E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
#E (a*)* - (0,0)(0,0)
E (a*)* - (0,0)(?,?) RE2/Go
E (a*)* - (0,0)(0,0)
E (a*)+ - (0,0)(0,0)
#E (a*|b)* - (0,0)(0,0)
E (a*|b)* - (0,0)(?,?) RE2/Go
E (a*|b)* - (0,0)(0,0)
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
#E (^)* - (0,0)(0,0)
E (^)* - (0,0)(?,?) RE2/Go
E (^)* - (0,0)(0,0)
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
#E ((a*|b))* - (0,0)(0,0)(0,0)
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
E ((a*|b))* - (0,0)(0,0)(0,0)
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
Expand Down
18 changes: 6 additions & 12 deletions testdata/nullsubexpr.dat
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
NOTE null subexpression matches : 2002-06-06

E (a*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
Expand All @@ -19,17 +18,15 @@ E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)

E ([a]*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
#E SAME b (0,0)(0,0)
E SAME b (0,0)(?,?) RE2/Go
E SAME b (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
Expand All @@ -41,11 +38,9 @@ E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
#E SAME aaaaaa (0,0)(0,0)
E SAME aaaaaa (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,0)(0,0)
E ([^ab]*)* ccccxx (0,6)(0,6)
#E SAME ababab (0,0)(0,0)
E SAME ababab (0,0)(?,?) RE2/Go
E SAME ababab (0,0)(0,0)

E ((z)+|a)* zabcde (0,2)(1,2)

Expand All @@ -65,8 +60,7 @@ B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)

#E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)

Expand Down
Binary file modified testdata/re2-exhaustive.txt.gz
Binary file not shown.

0 comments on commit c8ff00d

Please sign in to comment.