Skip to content

Commit

Permalink
internal/encoding/text/decode: stop using regexp
Browse files Browse the repository at this point in the history
This eliminates the last user of the regexp package, which should save
about 130K from the resulting stripped binary importing this package
(unless, of course, regexp is brought in directly of via another
dependency).

Added some new cases to TestDecoder to test the new function.

Benchmark (not included) shows the following results, comparing to
old implementation using regexp.Find:

name     old time/op    new time/op    delta
ErrId-4    1.93µs ± 1%    0.21µs ± 1%   -89.20%  (p=0.002 n=6+6)

name     old alloc/op   new alloc/op   delta
ErrId-4      128B ± 0%        0B       -100.00%  (p=0.002 n=6+6)

name     old allocs/op  new allocs/op  delta
ErrId-4      13.0 ± 0%       0.0       -100.00%  (p=0.002 n=6+6)

Change-Id: I5569a47580f41cc60f92c444e8d43bb3f26faa4e
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/402774
Reviewed-by: Cassondra Foesch <cfoesch@gmail.com>
Reviewed-by: Damien Neil <dneil@google.com>
Reviewed-by: Lasse Folger <lassefolger@google.com>
  • Loading branch information
kolyshkin authored and neild committed May 16, 2022
1 parent 8a7ba07 commit a048235
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
26 changes: 21 additions & 5 deletions internal/encoding/text/decode.go
Expand Up @@ -8,7 +8,6 @@ import (
"bytes"
"fmt"
"io"
"regexp"
"strconv"
"unicode/utf8"

Expand Down Expand Up @@ -421,7 +420,7 @@ func (d *Decoder) parseFieldName() (tok Token, err error) {
return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
}

return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
}

// parseTypeName parses Any type URL or extension field name. The name is
Expand Down Expand Up @@ -571,7 +570,7 @@ func (d *Decoder) parseScalar() (Token, error) {
return tok, nil
}

return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
}

// parseLiteralValue parses a literal value. A literal value is used for
Expand Down Expand Up @@ -653,8 +652,25 @@ func consume(b []byte, n int) []byte {
return b
}

// Any sequence that looks like a non-delimiter (for error reporting).
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
// errId extracts a byte sequence that looks like an invalid ID
// (for the purposes of error reporting).
func errId(seq []byte) []byte {
for i := 0; i < len(seq); {
r, size := utf8.DecodeRune(seq[i:])
if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
if i == 0 {
// Either the first byte is invalid UTF-8 or a
// delimiter, or the first rune is non-ASCII.
// Return it as-is.
i = size
}
return seq[:i:i]
}
i += size
}
// No delimiter found.
return seq
}

// isDelim returns true if given byte is a delimiter character.
func isDelim(c byte) bool {
Expand Down
24 changes: 24 additions & 0 deletions internal/encoding/text/decode_test.go
Expand Up @@ -380,6 +380,30 @@ func TestDecoder(t *testing.T) {
in: "123name",
want: []R{{E: "invalid field name: 123name"}},
},
{
in: `/`,
want: []R{{E: `invalid field name: /`}},
},
{
in: `世界`,
want: []R{{E: `invalid field name: 世`}},
},
{
in: `1a/b`,
want: []R{{E: `invalid field name: 1a`}},
},
{
in: `1c\d`,
want: []R{{E: `invalid field name: 1c`}},
},
{
in: "\x84f",
want: []R{{E: "invalid field name: \x84"}},
},
{
in: "\uFFFDxxx",
want: []R{{E: "invalid field name: \uFFFD"}},
},
{
in: "[type]",
want: []R{
Expand Down

0 comments on commit a048235

Please sign in to comment.