-
Notifications
You must be signed in to change notification settings - Fork 1
Speed up parsing notation LLSD #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2f252df
6d1ce89
147e7ea
3f91c7b
2764149
6bb156a
7330f31
5abdc32
232902c
1bb6812
2325efb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -141,6 +141,19 @@ def B(fmt): | |
return fmt | ||
|
||
|
||
class PY3SemanticBytes(BytesType):
    """
    ``BytesType`` subclass whose integer indexing yields an integer, as in Py3.

    Indexing with an integer (``buffer[n]``) returns ``ord()`` of whatever the
    parent class produced for that subscript. Any other kind of subscript,
    such as the slice ``buffer[n:n+1]``, returns the parent's result unchanged.
    """
    __slots__ = []

    def __getitem__(self, item):
        # Defer to the parent lookup first so that an invalid subscript raises
        # exactly what it would have raised on the parent type.
        fetched = super(PY3SemanticBytes, self).__getitem__(item)
        # Only integer subscripts are converted; slices pass straight through.
        return ord(fetched) if is_integer(item) else fetched
|
||
|
||
def is_integer(o):
    """Portable check: return True when *o* is an instance of ``IntTypes``."""
    int_like = isinstance(o, IntTypes)
    return int_like
|
@@ -321,19 +334,6 @@ def _to_python(node): | |
return NODE_HANDLERS[node.tag](node) | ||
|
||
|
||
def _hex_as_nybble(hex): | ||
"Accepts a single hex character and returns a nybble." | ||
if (hex >= b'0') and (hex <= b'9'): | ||
return ord(hex) - ord(b'0') | ||
elif (hex >= b'a') and (hex <=b'f'): | ||
return 10 + ord(hex) - ord(b'a') | ||
elif (hex >= b'A') and (hex <=b'F'): | ||
return 10 + ord(hex) - ord(b'A') | ||
else: | ||
raise LLSDParseError('Invalid hex character: %s' % hex) | ||
|
||
|
||
|
||
class LLSDBaseFormatter(object): | ||
""" | ||
This base class cannot be instantiated on its own: it assumes a subclass | ||
|
@@ -366,13 +366,22 @@ def __init__(self): | |
} | ||
|
||
|
||
_X_ORD = ord(b'x') | ||
_BACKSLASH_ORD = ord(b'\\') | ||
SaladDais marked this conversation as resolved.
Show resolved
Hide resolved
|
||
_DECODE_BUFF_ALLOC_SIZE = 1024 | ||
|
||
|
||
class LLSDBaseParser(object): | ||
""" | ||
Utility methods useful for parser subclasses. | ||
""" | ||
__slots__ = ['_buffer', '_index', '_decode_buff'] | ||
|
||
def __init__(self):
    """
    Set up the parser with an empty input buffer.

    Sets ``_buffer`` to empty bytes and ``_index`` to 0, and allocates
    ``_decode_buff`` as a ``bytearray`` of ``_DECODE_BUFF_ALLOC_SIZE`` bytes.
    """
    self._buffer = b''
    # Assigned once; the second, identical assignment was redundant.
    self._index = 0
    # Scratch space for decoding delimited strings
    self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
|
||
def _error(self, message, offset=0): | ||
try: | ||
|
@@ -399,53 +408,85 @@ def _getc(self, num=1): | |
|
||
# map char following escape char to corresponding character | ||
_escaped = { | ||
b'a': b'\a', | ||
b'b': b'\b', | ||
b'f': b'\f', | ||
b'n': b'\n', | ||
b'r': b'\r', | ||
b't': b'\t', | ||
b'v': b'\v', | ||
ord(b'a'): ord(b'\a'), | ||
ord(b'b'): ord(b'\b'), | ||
ord(b'f'): ord(b'\f'), | ||
ord(b'n'): ord(b'\n'), | ||
ord(b'r'): ord(b'\r'), | ||
ord(b't'): ord(b'\t'), | ||
ord(b'v'): ord(b'\v'), | ||
} | ||
|
||
def _parse_string_delim(self, delim): | ||
"Parse a delimited string." | ||
parts = bytearray() | ||
found_escape = False | ||
found_hex = False | ||
found_digit = False | ||
byte = 0 | ||
insert_idx = 0 | ||
delim_ord = ord(delim) | ||
# Preallocate a working buffer for the decoded string output | ||
# to avoid allocs in the hot loop. | ||
decode_buff = self._decode_buff | ||
# Cache these in locals, otherwise we have to perform a lookup on | ||
# `self` in the hot loop. | ||
buff = self._buffer | ||
read_idx = self._index | ||
cc = 0 | ||
while True: | ||
cc = self._getc() | ||
if found_escape: | ||
if found_hex: | ||
if found_digit: | ||
found_escape = False | ||
found_hex = False | ||
found_digit = False | ||
byte <<= 4 | ||
byte |= _hex_as_nybble(cc) | ||
parts.append(byte) | ||
byte = 0 | ||
try: | ||
cc = buff[read_idx] | ||
read_idx += 1 | ||
|
||
if cc == _BACKSLASH_ORD: | ||
# Backslash, figure out if this is an \xNN hex escape or | ||
# something like \t | ||
cc = buff[read_idx] | ||
read_idx += 1 | ||
if cc == _X_ORD: | ||
# It's a hex escape. char is the value of the two | ||
# following hex nybbles. This slice may result in | ||
# a short read (0 or 1 bytes), but either a | ||
# `ValueError` will be triggered by the first case, | ||
# and the second will cause an `IndexError` on the | ||
# next iteration of the loop. | ||
hex_bytes = buff[read_idx:read_idx + 2] | ||
read_idx += 2 | ||
try: | ||
# int() can parse a `bytes` containing hex, | ||
# no explicit `bytes.decode("ascii")` required. | ||
cc = int(hex_bytes, 16) | ||
except ValueError as e: | ||
# One of the hex characters was likely invalid. | ||
# Wrap the ValueError so that we can provide a | ||
# byte offset in the error. | ||
self._index = read_idx | ||
self._error(e, offset=-2) | ||
else: | ||
found_digit = True | ||
byte = _hex_as_nybble(cc) | ||
elif cc == b'x': | ||
found_hex = True | ||
else: | ||
found_escape = False | ||
# escape char preceding anything other than the chars in | ||
# _escaped just results in that same char without the | ||
# escape char | ||
parts.extend(self._escaped.get(cc, cc)) | ||
elif cc == b'\\': | ||
found_escape = True | ||
elif cc == delim: | ||
break | ||
else: | ||
parts.extend(cc) | ||
# escape char preceding anything other than the chars | ||
# in _escaped just results in that same char without | ||
# the escape char | ||
cc = self._escaped.get(cc, cc) | ||
elif cc == delim_ord: | ||
break | ||
except IndexError: | ||
# We can be reasonably sure that any IndexErrors inside here | ||
# were caused by an out-of-bounds `buff[read_idx]`. | ||
self._index = read_idx | ||
self._error("Trying to read past end of buffer") | ||
SaladDais marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
try: | ||
decode_buff[insert_idx] = cc | ||
except IndexError: | ||
# Oops, that overflowed the decoding buffer, make a | ||
# new expanded buffer containing the existing contents. | ||
decode_buff = bytearray(decode_buff) | ||
decode_buff.extend(b"\x00" * _DECODE_BUFF_ALLOC_SIZE) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be faster still to do something like:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess it depends on how many oversized strings you expect in the input stream. The code you submitted leaves you with a single larger buffer, prepared to handle many oversized strings without further allocations. My suggestion above should expand faster the first time, but requires consolidating multiple […]. I still suggest the code you wrote would be improved by catching […]. Say you're working on the second oversized string in the same input stream, so […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Whoops, not true: your expanded […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yep, my reasoning there was that I sometimes use long-lived […].
That is a good point! I forgot that in Python it's much less expensive to handle the (potential) […]. The buffer copy itself isn't terribly expensive relative to juggling and concatenating multiple buffers, though. Buffer concatenation was a tiny bit slower for payloads containing strings mostly under 1024 bytes. |
||
decode_buff[insert_idx] = cc | ||
|
||
insert_idx += 1 | ||
|
||
# Sync our local read index with the canonical one | ||
self._index = read_idx | ||
try: | ||
return parts.decode('utf-8') | ||
# Slice off only what we used of the working decode buffer | ||
return decode_buff[:insert_idx].decode('utf-8') | ||
except UnicodeDecodeError as exc: | ||
self._error(exc) | ||
|
||
|
@@ -457,4 +498,4 @@ def starts_with(startstr, something): | |
pos = something.tell() | ||
s = something.read(len(startstr)) | ||
something.seek(pos, os.SEEK_SET) | ||
return (s == startstr) | ||
return (s == startstr) |
Uh oh!
There was an error while loading. Please reload this page.