internal/encoding/text/decode: stop using regexp

This eliminates the last user of the regexp package, which should save about 130K from the resulting stripped binary importing this package (unless, of course, regexp is brought in directly of via another dependency). Added some new cases to TestDecoder to test the new function. Benchmark (not included) shows the following results, comparing to old implementation using regexp.Find: name old time/op new time/op delta ErrId-4 1.93µs ± 1% 0.21µs ± 1% -89.20% (p=0.002 n=6+6) name old alloc/op new alloc/op delta ErrId-4 128B ± 0% 0B -100.00% (p=0.002 n=6+6) name old allocs/op new allocs/op delta ErrId-4 13.0 ± 0% 0.0 -100.00% (p=0.002 n=6+6) Change-Id: I5569a47580f41cc60f92c444e8d43bb3f26faa4e Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/402774 Reviewed-by: Cassondra Foesch <cfoesch@gmail.com> Reviewed-by: Damien Neil <dneil@google.com> Reviewed-by: Lasse Folger <lassefolger@google.com>
protocolbuffers · May 16, 2022 · a048235 · a048235
1 parent 8a7ba07
commit a048235
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 5 deletions.
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
@@ -8,7 +8,6 @@ import (
 	"bytes"
 	"fmt"
 	"io"
-	"regexp"
 	"strconv"
 	"unicode/utf8"
 
@@ -421,7 +420,7 @@ func (d *Decoder) parseFieldName() (tok Token, err error) {
 		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
 	}
 
-	return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
 }
 
 // parseTypeName parses Any type URL or extension field name. The name is
@@ -571,7 +570,7 @@ func (d *Decoder) parseScalar() (Token, error) {
 		return tok, nil
 	}
 
-	return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
 }
 
 // parseLiteralValue parses a literal value. A literal value is used for
@@ -653,8 +652,25 @@ func consume(b []byte, n int) []byte {
 	return b
 }
 
-// Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
+// errId extracts a byte sequence that looks like an invalid ID
+// (for the purposes of error reporting).
+func errId(seq []byte) []byte {
+	for i := 0; i < len(seq); {
+		r, size := utf8.DecodeRune(seq[i:])
+		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
+			if i == 0 {
+				// Either the first byte is invalid UTF-8 or a
+				// delimiter, or the first rune is non-ASCII.
+				// Return it as-is.
+				i = size
+			}
+			return seq[:i:i]
+		}
+		i += size
+	}
+	// No delimiter found.
+	return seq
+}
 
 // isDelim returns true if given byte is a delimiter character.
 func isDelim(c byte) bool {

diff --git a/internal/encoding/text/decode_test.go b/internal/encoding/text/decode_test.go
@@ -380,6 +380,30 @@ func TestDecoder(t *testing.T) {
 			in:   "123name",
 			want: []R{{E: "invalid field name: 123name"}},
 		},
+		{
+			in:   `/`,
+			want: []R{{E: `invalid field name: /`}},
+		},
+		{
+			in:   `世界`,
+			want: []R{{E: `invalid field name: 世`}},
+		},
+		{
+			in:   `1a/b`,
+			want: []R{{E: `invalid field name: 1a`}},
+		},
+		{
+			in:   `1c\d`,
+			want: []R{{E: `invalid field name: 1c`}},
+		},
+		{
+			in:   "\x84f",
+			want: []R{{E: "invalid field name: \x84"}},
+		},
+		{
+			in:   "\uFFFDxxx",
+			want: []R{{E: "invalid field name: \uFFFD"}},
+		},
 		{
 			in: "[type]",
 			want: []R{