From 180238a7abcaed291375fcf8173b49ad3b75d953 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Thu, 16 Mar 2023 13:18:14 -0400 Subject: [PATCH 1/9] SL-18330: Refactor notation parsing to manage a lookahead char. Instead of peeking ahead by a character and then (most of the time) rereading it simply to discard it, go ahead and read the next character, passing it into the various LLSDNotationParser._parse_mumble() functions. This eliminates most LLSDNotationParser use of LLSDBaseParser._peek(), save for _get_re(). This eliminates the need for LLSDNotationParser._skip_then(), whose only job was to discard the next character and return the specified value. Break out LLSDNotationParser._parse_true() and _parse_false() in anticipation of potential reimplementation without _get_re(). --- llsd/serde_notation.py | 142 +++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 75 deletions(-) diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 6802b7e..f4ca979 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -40,18 +40,18 @@ def __init__(self): b'{': self._parse_map, # array b'[': self._parse_array, - # undefined -- have to eat the '!' - b'!': lambda: self._skip_then(None), - # false -- have to eat the '0' - b'0': lambda: self._skip_then(False), - # true -- have to eat the '1' - b'1': lambda: self._skip_then(True), + # undefined + b'!': lambda cc: None, + # false + b'0': lambda cc: False, + # true + b'1': lambda cc: True, # false, must check for F|f|false|FALSE - b'F': lambda: self._get_re("'false'", _false_regex, False), - b'f': lambda: self._get_re("'false'", _false_regex, False), + b'F': self._parse_false, + b'f': self._parse_false, # true, must check for T|t|true|TRUE - b'T': lambda: self._get_re("'true'", _true_regex, True), - b't': lambda: self._get_re("'true'", _true_regex, True), + b'T': self._parse_true, + b't': self._parse_true, # 'i' = integer b'i': self._parse_integer, # 'r' = real number @@ -76,21 +76,23 @@ def __init__(self): for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func - def parse(self, buffer, ignore_binary = False): + def parse(self, baseparser, ignore_binary = False): """ This is the basic public interface for parsing. - :param buffer: the notation string to parse. + :param baseparser: LLSDBaseParser or subclass holding data to parse. :param ignore_binary: parser throws away data in llsd binary nodes. :returns: returns a python object. """ - self._reset(buffer) + self._reset(baseparser) - # special case for notation: empty binary string means False - if not self._stream.peek(1): + # avoid self._getc() here because EOF is an acceptable result + cc = self._stream.read(1) + # special case for notation: empty input means False + if not cc: return False - return self._parse() + return self._parse(cc) def _get_until(self, delim): content = [] @@ -107,50 +109,43 @@ def _get_until(self, delim): # we've already consumed the close delim return b''.join(content) - def _skip_then(self, value): - # We've already _peek()ed at the current character, which is how we - # decided to call this method. Skip past it and return constant value. - self._getc() - return value - - def _get_re(self, desc, regex, override=None): + def _get_re(self, cc, desc, regex, override=None): # This is the case for which we introduced _peek(full=False). # Instead of trying to reimplement each of the re patterns passed to - # this method as individual operations on _util, peek ahead by a + # this method as individual operations on _stream, peek ahead by a # reasonable amount and directly use re. full=False means we're # willing to accept a result buffer shorter than our lookahead. + # Don't forget to prepend our lookahead character. # You would think we could parse int, real, True or False with fewer # bytes than this, but fuzz testing produces some real humdinger int # values. - peek = self._peek(80, full=False) + peek = cc + self._peek(80, full=False) match = regex.match(peek) if not match: self._error("Invalid %s token" % desc) else: - # skip what we matched - self._getc(match.end()) + # skip what we matched, adjusting for the char we already read + self._getc(match.end() - len(cc)) return override if override is not None else match.group(0) - def _parse(self): + def _parse(self, cc): "The notation parser workhorse." - cc = self._peek() try: func = self._dispatch[ord(cc)] except IndexError: # output error if the token was out of range self._error("Invalid notation token") else: - return func() + # pass the lookahead character that selected this func + return func(cc) - def _parse_binary(self): + def _parse_binary(self, cc): "parse a single binary object." - self._getc() # eat the beginning 'b' - cc = self._peek() + # skip the beginning 'b' + cc = self._getc() if cc == b'(': # parse raw binary - paren = self._getc() - # grab the 'expected' size of the binary data size = self._get_until(b')') if size == None: @@ -174,7 +169,7 @@ def _parse_binary(self): else: # get the encoding base - base = self._getc(2) + base = cc + self._getc() try: decoder = { b'16': base64.b16decode, @@ -201,7 +196,7 @@ def _parse_binary(self): # convert exception class so it's more catchable self._error("Bad binary data: " + str(exc)) - def _parse_map(self): + def _parse_map(self, cc): """ parse a single map @@ -210,97 +205,89 @@ def _parse_map(self): rv = {} key = b'' found_key = False - self._getc() # eat the beginning '{' - cc = self._peek() + # skip the beginning '{' + cc = self._getc() while (cc != b'}'): if cc is None: self._error("Unclosed map") if not found_key: if cc in (b"'", b'"', b's'): - key = self._parse_string() + key = self._parse_string(cc) found_key = True elif cc.isspace() or cc == b',': - self._getc() # eat the character + # ignore space or comma pass else: self._error("Invalid map key") elif cc.isspace(): - self._getc() # eat the space + # ignore space pass elif cc == b':': - self._getc() # eat the ':' - value = self._parse() + # skip the ':' + value = self._parse(self._getc()) rv[key] = value found_key = False else: self._error("missing separator") - cc = self._peek() - - if self._getc() != b'}': - self._error("Invalid map close token") + cc = self._getc() return rv - def _parse_array(self): + def _parse_array(self, cc): """ parse a single array. array: [ object, object, object ] """ rv = [] - self._getc() # eat the beginning '[' - cc = self._peek() + # skip the beginning '[' + cc = self._getc() while (cc != b']'): if cc is None: self._error('Unclosed array') if cc.isspace() or cc == b',': - self._getc() - cc = self._peek() + cc = self._getc() continue - rv.append(self._parse()) - cc = self._peek() + rv.append(self._parse(cc)) + cc = self._getc() - if self._getc() != b']': - self._error("Invalid array close token") return rv - def _parse_uuid(self): + def _parse_uuid(self, cc): "Parse a uuid." - self._getc() # eat the beginning 'u' + # ignore the beginning 'u' # see comment on LLSDNotationFormatter.UUID() re use of latin-1 return uuid.UUID(hex=self._getc(36).decode('latin-1')) - def _parse_uri(self): + def _parse_uri(self, cc): "Parse a URI." - self._getc() # eat the beginning 'l' - return uri(self._parse_string()) + # skip the beginning 'l' + return uri(self._parse_string(self._getc())) - def _parse_date(self): + def _parse_date(self, cc): "Parse a date." - self._getc() # eat the beginning 'd' - datestr = self._parse_string() + # skip the beginning 'd' + datestr = self._parse_string(self._getc()) return _parse_datestr(datestr) - def _parse_real(self): + def _parse_real(self, cc): "Parse a floating point number." - self._getc() # eat the beginning 'r' - return float(self._get_re("real", _real_regex)) + # ignore the beginning 'r' + return float(self._get_re(b'', "real", _real_regex)) - def _parse_integer(self): + def _parse_integer(self, cc): "Parse an integer." - self._getc() # eat the beginning 'i' - return int(self._get_re("integer", _int_regex)) + # ignore the beginning 'i' + return int(self._get_re(b'', "integer", _int_regex)) - def _parse_string(self): + def _parse_string(self, delim): """ Parse a string string: "g\'day" | 'have a "nice" day' | s(size)"raw data" """ rv = "" - delim = self._peek() if delim in (b"'", b'"'): - delim = self._getc() # eat the beginning delim rv = self._parse_string_delim(delim) elif delim == b's': rv = self._parse_string_raw() @@ -315,7 +302,6 @@ def _parse_string_raw(self): string: s(size)"raw data" """ - self._getc() # eat the beginning 's' # Read the (size) portion. cc = self._getc() if cc != b'(': @@ -339,6 +325,12 @@ def _parse_string_raw(self): except UnicodeDecodeError as exc: raise LLSDParseError(exc) + def _parse_true(self, cc): + return self._get_re(cc, "'true'", _true_regex, True) + + def _parse_false(self, cc): + return self._get_re(cc, "'false'", _false_regex, False) + class LLSDNotationFormatter(LLSDBaseFormatter): """ From 66164b49ebb438571b022729b5c7248236c9f1b0 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Thu, 16 Mar 2023 16:42:57 -0400 Subject: [PATCH 2/9] SL-18330: Parse notation integers "by hand" instead of with regex. To further reduce LLSDNotationParser's reliance on _get_re(), change _parse_integer() to read individual characters until it sees a delimiter -- instead of using _peek(). Make LLSDBaseParser._getc() support full=False, allowing a short read when explicitly specified. This is useful for when an LLSD int ends at EOF. Add LLSDBaseParser._putback(), a convenience wrapper around seek(). --- llsd/base.py | 10 ++++++++-- llsd/serde_notation.py | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/llsd/base.py b/llsd/base.py index aaabb54..c937a9a 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -450,12 +450,18 @@ def _next_nonblank(self): c = self._stream.read(1) return c - def _getc(self, num=1): + def _getc(self, num=1, full=True): got = self._stream.read(num) - if len(got) < num: + if full and len(got) < num: self._error("Trying to read past end of stream") return got + def _putback(self, num=1): + # if either of these tests fail, it's not a user error, it's a coding error + assert num > 0 + assert self._stream.tell() >= num + self._stream.seek(-num, io.SEEK_CUR) + def _peek(self, num=1, full=True): # full=True means error if we can't peek ahead num bytes if num < 0: diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index f4ca979..7ebe861 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -7,7 +7,6 @@ LLSDParseError, LLSDSerializationError, UnicodeType, _format_datestr, _parse_datestr, _str_to_bytes, binary, uri) -_int_regex = re.compile(br"[-+]?\d+") _real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan") _true_regex = re.compile(br"TRUE|true|\b[Tt]\b") _false_regex = re.compile(br"FALSE|false|\b[Ff]\b") @@ -71,7 +70,7 @@ def __init__(self): } # Like LLSDBinaryParser, construct a lookup list from this dict. Start # by filling with the 'else' case. - self._dispatch = 256*[lambda: self._error("Invalid notation token")] + self._dispatch = 256*[lambda cc: self._error("Invalid notation token")] # Then fill in specific entries based on the dict above. for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func @@ -86,8 +85,8 @@ def parse(self, baseparser, ignore_binary = False): """ self._reset(baseparser) - # avoid self._getc() here because EOF is an acceptable result - cc = self._stream.read(1) + # EOF is an acceptable result + cc = self._getc(full=False) # special case for notation: empty input means False if not cc: return False @@ -116,10 +115,10 @@ def _get_re(self, cc, desc, regex, override=None): # reasonable amount and directly use re. full=False means we're # willing to accept a result buffer shorter than our lookahead. # Don't forget to prepend our lookahead character. - # You would think we could parse int, real, True or False with fewer - # bytes than this, but fuzz testing produces some real humdinger int + # You would think we could parse real, True or False with fewer bytes + # than this, but fuzz testing produces some real humdinger float # values. - peek = cc + self._peek(80, full=False) + peek = cc + self._peek(30, full=False) match = regex.match(peek) if not match: self._error("Invalid %s token" % desc) @@ -278,7 +277,28 @@ def _parse_real(self, cc): def _parse_integer(self, cc): "Parse an integer." # ignore the beginning 'i' - return int(self._get_re(b'', "integer", _int_regex)) + cc = self._getc() + sign = 1 + if cc == b'+': + cc = self._getc() + elif cc == b'-': + sign = -1 + cc = self._getc() + + digits = [] + while cc.isdigit(): + digits.append(cc) + # we can accept EOF happening here + cc = self._getc(full=False) + + # cc is now the next _getc() after the last digit -- back up + if cc: + self._putback() + + if not digits: + self._error('Invalid integer token') + + return sign * int(b''.join(digits)) def _parse_string(self, delim): """ From 16ca726fe11fe1b9a3d3b0aadd90b4bdf6d10391 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Thu, 16 Mar 2023 17:20:00 -0400 Subject: [PATCH 3/9] SL-18330: Parse notation true/false without using _get_re() and hence without using LLSDBaseParser._peek(). --- llsd/serde_notation.py | 46 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 7ebe861..13cca4a 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -8,8 +8,6 @@ _format_datestr, _parse_datestr, _str_to_bytes, binary, uri) _real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan") -_true_regex = re.compile(br"TRUE|true|\b[Tt]\b") -_false_regex = re.compile(br"FALSE|false|\b[Ff]\b") class LLSDNotationParser(LLSDBaseParser): @@ -346,10 +344,50 @@ def _parse_string_raw(self): raise LLSDParseError(exc) def _parse_true(self, cc): - return self._get_re(cc, "'true'", _true_regex, True) + # match t, T, true, TRUE -- not mixed-case + try: + rest = {b't': b'rue', b'T': b'RUE'}[cc] + except KeyError: + self._error("Invalid 'true' token") + + cc = self._getc(full=False) + # beware, rest is bytes, so bytes[0] is an int! + if cc != rest[:1]: + # just 't' or 'T' is legal, put back cc and carry on + if cc: + self._putback() + return True + + # saw 'tr' or 'TR', cc is the 'r' + tail = self._getc(len(rest)-1) + # 'tr' MUST be followed by 'ue' + if tail != rest[1:]: + self._error("Invalid 'true' token") + # good, it is + return True def _parse_false(self, cc): - return self._get_re(cc, "'false'", _false_regex, False) + # match f, F, false, FALSE -- not mixed-case + try: + rest = {b'f': b'alse', b'F': b'ALSE'}[cc] + except KeyError: + self._error("Invalid 'false' token") + + cc = self._getc(full=False) + # beware, rest is bytes, so bytes[0] is an int! + if cc != rest[:1]: + # just 'f' or 'F' is legal, put back cc and carry on + if cc: + self._putback() + return False + + # saw 'fa' or 'FA', cc is the 'a' + tail = self._getc(len(rest)-1) + # 'fa' MUST be followed by 'lse' + if tail != rest[1:]: + self._error("Invalid 'false' token") + # good, it is + return False class LLSDNotationFormatter(LLSDBaseFormatter): From 228103e9d91cb27685dbbc6e20c2d0f42d351bff Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Fri, 17 Mar 2023 11:52:51 -0400 Subject: [PATCH 4/9] SL-18330: Eliminate LLSDBaseParser._peek() from serde_binary.py. LLSDBinaryParser._parse_array() does not actually depend on peek() to check for close bracket ']': it has an explicit element count. --- llsd/serde_binary.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 42a2c0d..4f45ab2 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -7,6 +7,14 @@ _str_to_bytes, binary, is_integer, is_string, uri) +try: + # Python 2 + xrange +except NameError: + # Python 3 + xrange = range + + class LLSDBinaryParser(LLSDBaseParser): """ Parse application/llsd+binary to a python object. @@ -106,15 +114,10 @@ def _parse_array(self): "Parse a single llsd array" rv = [] size = struct.unpack("!i", self._getc(4))[0] - count = 0 - cc = self._peek() - while (cc != b']') and (count < size): + for count in xrange(size): rv.append(self._parse()) - count += 1 - cc = self._peek() - if cc != b']': + if self._getc() != b']': self._error("invalid array close token") - self._getc() return rv def _parse_string(self): From 6591194fe750e2f6e3a0eb0ee0a012fbef3092bf Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Fri, 17 Mar 2023 12:00:19 -0400 Subject: [PATCH 5/9] SL-18330: Eliminate _peek() call from LLSDBaseParser. The only _peek() call was in _error(). Once _error() reads a character, we no longer care about the read position of _stream. --- llsd/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llsd/base.py b/llsd/base.py index c937a9a..8e11c04 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -498,7 +498,7 @@ def _error(self, message, offset=0): # 'offset' is relative to current pos self._stream.seek(offset, io.SEEK_CUR) raise LLSDParseError("%s at byte %d: %r" % - (message, oldpos+offset, self._peek(1, full=False))) + (message, oldpos+offset, self._getc(1, full=False))) # map char following escape char to corresponding character _escaped = { From f5aab9f51864766485302e778b0feaa3924014a4 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Fri, 17 Mar 2023 15:28:55 -0400 Subject: [PATCH 6/9] SL-18330: Remove peek() calls entirely from LLSD parsing. Remove LLSDBaseParser._peek(). All parsers now use only _getc(), with occasional _putback(), which uses seek(). Remove LLSDNotationParser._get_re(), the last consumer of _peek(). Remove serde_notation._real_regex, the last usage of _get_re(). Recast _parse_real(), the last caller of _get_re(), to scan digits et al. like a lexer. Break out _collect_sign() and _collect_digits() from _parse_integer(); these are called multiple times in new _parse_real() logic. Break out parameterized _parse_bool() from _parse_true() and _parse_false(), eliminating redundant implementations. Further break out _expect() from _parse_bool(); this is also called by new _parse_real() logic. Make LLSDBaseParser._putback() accept the bytes string it's supposed to put back, but only for its length: sometimes we call _putback() when we've reached EOF, in which case the passed string is empty. This avoids having to replicate 'if cc: self._putback()' logic everywhere. Refine LLSDBaseParser._reset() wrapper logic to wrap an existing stream in io.BufferedReader only if it's not already seekable(). That includes an incoming bytes object: wrap it only in BytesIO, which is seekable(). --- llsd/base.py | 49 ++---------- llsd/serde_notation.py | 164 ++++++++++++++++++++++------------------- 2 files changed, 97 insertions(+), 116 deletions(-) diff --git a/llsd/base.py b/llsd/base.py index 8e11c04..80d8ba7 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -372,13 +372,12 @@ def _reset(self, something): # Wrap an incoming bytes string into a stream. If the passed bytes # string is so large that the overhead of copying it into a # BytesIO is significant, advise caller to pass a stream instead. - # BytesIO has no peek() method, so wrap it in BufferedReader. - self._stream = io.BufferedReader(io.BytesIO(something)) - elif hasattr(something, 'peek'): - # 'something' is already a buffered stream, use directly + self._stream = io.BytesIO(something) + elif something.seekable(): + # 'something' is already a seekable stream, use directly self._stream = something else: - # 'something' isn't buffered, wrap in BufferedReader + # 'something' isn't seekable, wrap in BufferedReader # (let BufferedReader handle the problem of passing an # inappropriate object) self._stream = io.BufferedReader(something) @@ -456,42 +455,10 @@ def _getc(self, num=1, full=True): self._error("Trying to read past end of stream") return got - def _putback(self, num=1): - # if either of these tests fail, it's not a user error, it's a coding error - assert num > 0 - assert self._stream.tell() >= num - self._stream.seek(-num, io.SEEK_CUR) - - def _peek(self, num=1, full=True): - # full=True means error if we can't peek ahead num bytes - if num < 0: - # There aren't many ways this can happen. The likeliest is that - # we've just read garbage length bytes from a binary input string. - # We happen to know that lengths are encoded as 4 bytes, so back - # off by 4 bytes to try to point the user at the right spot. - self._error("Invalid length field %d" % num, -4) - - # Instead of using self._stream.peek() at all, use read(num) and reset - # the read pointer. BufferedReader.peek() does not promise to return - # the requested length, but does not clarify the conditions under - # which it returns fewer bytes. - # https://docs.python.org/3/library/io.html#io.BufferedReader.peek - # In practice, we've seen it fail with an input file up over 100Kb: - # peek() returns only part of what we requested, but because we also - # passed full=False (see LLSDNotationParser._get_re()), we didn't - # notice and the parse failed looking for a map delimiter halfway - # through a large decimal integer. read(num), on the other hand, - # promises to return num bytes until actual EOF. - oldpos = self._stream.tell() - try: - got = self._stream.read(num) - if full and len(got) < num: - self._error("Trying to peek past end of stream") - - return got - - finally: - self._stream.seek(oldpos) + def _putback(self, cc): + # if this test fails, it's not a user error, it's a coding error + assert self._stream.tell() >= len(cc) + self._stream.seek(-len(cc), io.SEEK_CUR) def _error(self, message, offset=0): oldpos = self._stream.tell() diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 13cca4a..f6ed21d 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -7,8 +7,6 @@ LLSDParseError, LLSDSerializationError, UnicodeType, _format_datestr, _parse_datestr, _str_to_bytes, binary, uri) -_real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan") - class LLSDNotationParser(LLSDBaseParser): """ @@ -106,25 +104,6 @@ def _get_until(self, delim): # we've already consumed the close delim return b''.join(content) - def _get_re(self, cc, desc, regex, override=None): - # This is the case for which we introduced _peek(full=False). - # Instead of trying to reimplement each of the re patterns passed to - # this method as individual operations on _stream, peek ahead by a - # reasonable amount and directly use re. full=False means we're - # willing to accept a result buffer shorter than our lookahead. - # Don't forget to prepend our lookahead character. - # You would think we could parse real, True or False with fewer bytes - # than this, but fuzz testing produces some real humdinger float - # values. - peek = cc + self._peek(30, full=False) - match = regex.match(peek) - if not match: - self._error("Invalid %s token" % desc) - else: - # skip what we matched, adjusting for the char we already read - self._getc(match.end() - len(cc)) - return override if override is not None else match.group(0) - def _parse(self, cc): "The notation parser workhorse." try: @@ -269,34 +248,82 @@ def _parse_date(self, cc): def _parse_real(self, cc): "Parse a floating point number." - # ignore the beginning 'r' - return float(self._get_re(b'', "real", _real_regex)) + # recognize: + # [+-]?inf + # [+-]?nan + # [+-]?basepart([eE][+-]?\d+)? + # where basepart could be either: + # \d+(\.\d*)? or + # \d*\.\d+ + digits = [] + # skip the beginning 'r' + cc = self._collect_sign(self._getc(), digits) + try: + rest = {b'i': b'nf', b'n': b'an'}[cc] + except KeyError: + # cc is neither 'i' nor 'n', must be a digit: + # collect integer digits + idigits = [] + fdigits = [] + edigits = [] + cc = self._collect_digits(cc, idigits) + digits.extend(idigits) + if cc == b'.': + digits.append(cc) + # skip decimal point and collect fractional digits + cc = self._collect_digits(self._getc(full=False), fdigits) + digits.extend(fdigits) + # Fun fact: (cc in b'eE') is True even when cc is b''! + if cc in (b'e', b'E'): + digits.append(cc) + # skip 'e' and check for exponent sign + cc = self._collect_sign(self._getc(), digits) + cc = self._collect_digits(cc, edigits) + digits.extend(edigits) + if not edigits: + # if 'e' is present, there MUST be an exponent + self._error('Invalid real exponent') + # Whether this real number ended after the integer part, after the + # decimal point, after the fractional part or after the exponent, + # cc is now one character PAST the end -- put it back. + self._putback(cc) + # The reason we collected idigits and fdigits separately is that + # while either may be empty, they may not BOTH be empty. + if not (idigits or fdigits): + self._error('Invalid real number') + else: + # cc is either 'i' for 'inf' or 'n' for 'nan', + # rest is 'nf' or 'an' + digits.extend([cc, self._expect(cc + rest, rest)]) + + return float(b''.join(digits)) def _parse_integer(self, cc): "Parse an integer." - # ignore the beginning 'i' - cc = self._getc() - sign = 1 - if cc == b'+': - cc = self._getc() - elif cc == b'-': - sign = -1 + digits = [] + # skip the beginning 'i' + cc = self._collect_sign(self._getc(), digits) + cc = self._collect_digits(cc, digits) + if not digits: + self._error('Invalid integer token') + + # cc is now the next _getc() after the last digit -- back up + self._putback(cc) + + return int(b''.join(digits)) + + def _collect_sign(self, cc, digits): + if cc in (b'+', b'-'): + digits.append(cc) cc = self._getc() + return cc - digits = [] + def _collect_digits(self, cc, digits): while cc.isdigit(): digits.append(cc) # we can accept EOF happening here cc = self._getc(full=False) - - # cc is now the next _getc() after the last digit -- back up - if cc: - self._putback() - - if not digits: - self._error('Invalid integer token') - - return sign * int(b''.join(digits)) + return cc def _parse_string(self, delim): """ @@ -345,49 +372,36 @@ def _parse_string_raw(self): def _parse_true(self, cc): # match t, T, true, TRUE -- not mixed-case - try: - rest = {b't': b'rue', b'T': b'RUE'}[cc] - except KeyError: - self._error("Invalid 'true' token") - - cc = self._getc(full=False) - # beware, rest is bytes, so bytes[0] is an int! - if cc != rest[:1]: - # just 't' or 'T' is legal, put back cc and carry on - if cc: - self._putback() - return True - - # saw 'tr' or 'TR', cc is the 'r' - tail = self._getc(len(rest)-1) - # 'tr' MUST be followed by 'ue' - if tail != rest[1:]: - self._error("Invalid 'true' token") - # good, it is - return True + return self._parse_bool(cc, True, (b'true', b'TRUE')) def _parse_false(self, cc): # match f, F, false, FALSE -- not mixed-case + return self._parse_bool(cc, False, (b'false', b'FALSE')) + + def _parse_bool(self, cc, result, tokens): try: - rest = {b'f': b'alse', b'F': b'ALSE'}[cc] + # Index on first character to find expected rest. + # Beware, token is bytes, so token[0] is an int! + rest = {token[:1]: token[1:] for token in tokens}[cc] except KeyError: - self._error("Invalid 'false' token") + self._error("Invalid '%s' token" % tokens[0]) cc = self._getc(full=False) - # beware, rest is bytes, so bytes[0] is an int! if cc != rest[:1]: - # just 'f' or 'F' is legal, put back cc and carry on - if cc: - self._putback() - return False - - # saw 'fa' or 'FA', cc is the 'a' - tail = self._getc(len(rest)-1) - # 'fa' MUST be followed by 'lse' - if tail != rest[1:]: - self._error("Invalid 'false' token") - # good, it is - return False + # legal to have only first char, put back cc and carry on + self._putback(cc) + return result + + # saw 'tr' or 'TR' (or 'fa' or 'FA'), cc is the second char: + # MUST be followed by the rest of 'rest' + self._expect(tokens[0], rest[1:]) + return result + + def _expect(self, token, match): + # verify that the next several chars are exactly what we expect + if self._getc(len(match), full=False) != match: + self._error("Invalid '%s' token" % token) + return match class LLSDNotationFormatter(LLSDBaseFormatter): From d31981da491a2ec852b3a456cc57753ae9d0c7fe Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Mon, 20 Mar 2023 15:03:46 -0400 Subject: [PATCH 7/9] SL-18330: Update LLSD{Binary,Notation}Parser.parse() docstrings. The description of the parameter to parse didn't enumerate all acceptable forms. --- llsd/serde_binary.py | 7 ++++--- llsd/serde_notation.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 4f45ab2..0b53196 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -63,15 +63,16 @@ def __init__(self): for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func - def parse(self, buffer, ignore_binary = False): + def parse(self, something, ignore_binary = False): """ This is the basic public interface for parsing. - :param buffer: the binary data to parse in an indexable sequence. + :param something: serialized LLSD to parse: a bytes object, a binary + stream or an LLSDBaseParser subclass. :param ignore_binary: parser throws away data in llsd binary nodes. :returns: returns a python object. """ - self._reset(buffer) + self._reset(something) self._keep_binary = not ignore_binary try: return self._parse() diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index f6ed21d..d88f3a0 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -71,15 +71,16 @@ def __init__(self): for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func - def parse(self, baseparser, ignore_binary = False): + def parse(self, something, ignore_binary = False): """ This is the basic public interface for parsing. - :param baseparser: LLSDBaseParser or subclass holding data to parse. + :param something: serialized LLSD to parse: a bytes object, a binary + stream or an LLSDBaseParser subclass. :param ignore_binary: parser throws away data in llsd binary nodes. :returns: returns a python object. """ - self._reset(baseparser) + self._reset(something) # EOF is an acceptable result cc = self._getc(full=False) From e53c051396ac9d9108c3aba20023593d12dd9a68 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Mon, 20 Mar 2023 16:25:30 -0400 Subject: [PATCH 8/9] SL-18330: Use xml.etree.ElementTree.fromstring() when we can. Apparently, wrapping an incoming bytes object in io.BytesIO and passing it to xml.etree.ElementTree.parse() is slower than passing the bytes object directly to fromstring(). Detect the BytesIO case and call fromstring() with contents. --- llsd/serde_xml.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 2f048a0..99d39b8 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -1,4 +1,5 @@ import base64 +import io import re import types @@ -264,8 +265,15 @@ def parse_xml_nohdr(baseparser): # before XML declaration. Since we explicitly test support for that case, # skip initial whitespace. baseparser.matchseq(b'') + stream = baseparser.remainder() try: - element = _parse(baseparser.remainder()).getroot() + if isinstance(stream, io.BytesIO): + # Empirically, fromstring() seems faster than _parse(). If passed + # a BytesIO, extract its contents and skip to BytesIO read pos. + element = fromstring(stream.getvalue()[stream.tell():]) + else: + # Not a BytesIO, parse the stream + element = _parse(stream).getroot() except ElementTreeError as err: raise LLSDParseError(*err.args) From 9249fcee5c66b4f44d602eafd09569806d78195e Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 21 Mar 2023 17:09:15 -0400 Subject: [PATCH 9/9] SL-18330: Unify Py 2 and 3 on 'range()' not 'xrange()' --- llsd/serde_binary.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 9b0e1ee..6f0d93e 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -9,11 +9,11 @@ try: - # Python 2 - xrange + # Python 2: make 'range()' lazy like Python 3 + range = xrange except NameError: - # Python 3 - xrange = range + # Python 3: 'range()' is already lazy + pass class LLSDBinaryParser(LLSDBaseParser): @@ -116,7 +116,7 @@ def _parse_array(self): "Parse a single llsd array" rv = [] size = struct.unpack("!i", self._getc(4))[0] - for count in xrange(size): + for count in range(size): rv.append(self._parse()) if self._getc() != b']': self._error("invalid array close token")