Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 36 additions & 21 deletions llsd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,55 @@

http://wiki.secondlife.com/wiki/LLSD
"""
from llsd.base import (_LLSD, BINARY_MIME_TYPE, NOTATION_MIME_TYPE, XML_MIME_TYPE, LLSDParseError,
LLSDSerializationError, LongType, UnicodeType, binary, starts_with, undef, uri)
from llsd.serde_binary import LLSDBinaryParser, format_binary, parse_binary
from llsd.serde_notation import LLSDNotationFormatter, LLSDNotationParser, format_notation, parse_notation
from llsd.serde_xml import LLSDXMLFormatter, LLSDXMLPrettyFormatter, format_pretty_xml, format_xml, parse_xml
from llsd.base import (_LLSD, BINARY_MIME_TYPE, NOTATION_MIME_TYPE, XML_MIME_TYPE,
BINARY_HEADER, NOTATION_HEADER, XML_HEADER,
LLSDBaseParser, LLSDParseError, LLSDSerializationError,
LongType, UnicodeType, binary, undef, uri)
from llsd.serde_binary import LLSDBinaryParser, format_binary, parse_binary, parse_binary_nohdr
from llsd.serde_notation import LLSDNotationFormatter, LLSDNotationParser, format_notation, parse_notation, parse_notation_nohdr
from llsd.serde_xml import LLSDXMLFormatter, LLSDXMLPrettyFormatter, format_pretty_xml, format_xml, parse_xml, parse_xml_nohdr


def parse(something, mime_type = None):
"""
This is the basic public interface for parsing llsd.

:param something: The data to parse. This is expected to be bytes, not strings
:param something: The data to parse. This is expected to be bytes, not
strings, or a byte stream.
:param mime_type: The mime_type of the data if it is known.
:returns: Returns a python object.

Python 3 Note: when reading LLSD from a file, use open()'s 'rb' mode explicitly
"""
if mime_type in (XML_MIME_TYPE, 'application/llsd'):
return parse_xml(something)
elif mime_type == BINARY_MIME_TYPE:
return parse_binary(something)
elif mime_type == NOTATION_MIME_TYPE:
return parse_notation(something)
#elif content_type == 'application/json':
# return parse_notation(something)
try:
something = something.lstrip() #remove any pre-trailing whitespace
if starts_with(b'<?llsd/binary?>', something):
return parse_binary(something)
# This should be better.
elif starts_with(b'<', something):
return parse_xml(something)
if mime_type:
# explicit mime_type -- 'something' may or may not also have a header
for mime_types, parser in (
({XML_MIME_TYPE, 'application/llsd'}, parse_xml),
({BINARY_MIME_TYPE}, parse_binary),
({NOTATION_MIME_TYPE}, parse_notation),
## ({'application/json'}, parse_notation),
):
if mime_type.lower() in mime_types:
return parser(something)

# no recognized mime type, look for header
baseparser = LLSDBaseParser(something)
for pattern, parser in (
(BINARY_HEADER, parse_binary_nohdr),
(NOTATION_HEADER, parse_notation_nohdr),
(XML_HEADER, parse_xml_nohdr),
):
if baseparser.matchseq(pattern):
# we already saw the header, don't check again
return parser(baseparser)

# no recognized header -- does content resemble XML?
if baseparser.starts_with(b'<'):
return parse_xml_nohdr(baseparser)
else:
return parse_notation(something)
return parse_notation_nohdr(baseparser)

except KeyError as e:
raise LLSDParseError('LLSD could not be parsed: %s' % (e,))
except TypeError as e:
Expand Down
190 changes: 127 additions & 63 deletions llsd/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import base64
import binascii
import datetime
import io
import os
import re
import sys
Expand All @@ -24,6 +25,10 @@
BINARY_MIME_TYPE = 'application/llsd+binary'
NOTATION_MIME_TYPE = 'application/llsd+notation'

XML_HEADER = b'<? llsd/xml ?>'
BINARY_HEADER = b'<? llsd/binary ?>'
NOTATION_HEADER = b'<? llsd/notation ?>'

ALL_CHARS = str(bytearray(range(256))) if PY2 else bytes(range(256))


Expand Down Expand Up @@ -78,12 +83,6 @@ class LLSDSerializationError(TypeError):
except NameError:
UnicodeType = str

# can't just check for NameError: 'bytes' is defined in both Python 2 and 3
if PY2:
BytesType = str
else:
BytesType = bytes

try:
b'%s' % (b'yes',)
except TypeError:
Expand Down Expand Up @@ -141,19 +140,6 @@ def B(fmt):
return fmt


class PY3SemanticBytes(BytesType):
"""Wrapper to make `buffer[n]` return an integer like in Py3"""
__slots__ = []

def __getitem__(self, item):
ret = super(PY3SemanticBytes, self).__getitem__(item)
# `buffer[n]` should return an integer, but slice syntax like
# `buffer[n:n+1]` should still return a `Bytes` object as before.
if is_integer(item):
return ord(ret)
return ret


def is_integer(o):
""" portable test if an object is like an int """
return isinstance(o, IntTypes)
Expand All @@ -169,11 +155,6 @@ def is_string(o):
return isinstance(o, StringTypes)


def is_bytes(o):
""" portable check if an object is an immutable byte array """
return isinstance(o, BytesType)


#date: d"YYYY-MM-DDTHH:MM:SS.FFFFFFZ"
_date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T"
r"(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})"
Expand Down Expand Up @@ -375,36 +356,136 @@ class LLSDBaseParser(object):
"""
Utility methods useful for parser subclasses.
"""
__slots__ = ['_buffer', '_index', '_decode_buff']
__slots__ = ['_stream', '_decode_buff']

def __init__(self):
self._buffer = b''
self._index = 0
def __init__(self, something=b''):
self._reset(something)
# Scratch space for decoding delimited strings
self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)

def _error(self, message, offset=0):
def _reset(self, something):
    """
    Point this parser at a new input source, stored as self._stream.

    :param something: the input to parse. Accepted forms:

        * another LLSDBaseParser (or subclass) instance -- its existing
          stream object is shared, not copied;
        * a bytes-like object (bytes, bytearray or memoryview) -- wrapped
          in an in-memory buffered stream;
        * an object that already provides peek() -- used as-is;
        * anything else -- handed to io.BufferedReader, which is left to
          reject inappropriate objects.
    """
    if isinstance(something, LLSDBaseParser):
        # Borrow the other parser's stream so both parsers see the same
        # underlying stream object.
        self._stream = something._stream
    elif isinstance(something, (bytes, bytearray, memoryview)):
        # Wrap incoming bytes-like data into a stream. bytearray and
        # memoryview are accepted alongside bytes: io.BytesIO takes any
        # bytes-like initial value, whereas io.BufferedReader (the
        # fallback below) expects a raw stream, not a buffer.
        # If the data is so large that copying it into a BytesIO is
        # significant, the caller should pass a stream instead.
        # BytesIO has no peek() method, so wrap it in BufferedReader.
        self._stream = io.BufferedReader(io.BytesIO(something))
    elif hasattr(something, 'peek'):
        # 'something' is already a buffered stream, use directly
        self._stream = something
    else:
        # 'something' isn't buffered, wrap in BufferedReader
        # (let BufferedReader handle the problem of passing an
        # inappropriate object)
        self._stream = io.BufferedReader(something)

def starts_with(self, pattern):
"""
Like matchseq(), except that starts_with() doesn't consume what it
matches: it always resets our input stream to its previous position.
"""
oldpos = self._stream.tell()
try:
byte = self._buffer[self._index+offset]
except IndexError:
byte = None
raise LLSDParseError("%s at byte %d: %s" % (message, self._index+offset, byte))
return self.matchseq(pattern)
finally:
self._stream.seek(oldpos)

def _peek(self, num=1):
def matchseq(self, pattern):
    """
    Try to consume 'pattern' from the input stream.

    'pattern' is a plain bytes string, NOT a regular expression. It is
    split on whitespace into chunks; before each chunk any amount of
    whitespace (including none) is skipped in the stream, and the chunk
    itself is compared case-insensitively.

    On success, whitespace following the final chunk is skipped as well,
    the stream is left at the first non-blank byte after the match (or
    at EOF), and True is returned.

    On failure the stream is rewound to where it was on entry and False
    is returned.
    """
    stream = self._stream
    start = stream.tell()

    for token in pattern.split():
        # _next_nonblank() consumes the first non-blank byte, so that
        # byte is the head of our candidate; an empty result means EOF.
        head = self._next_nonblank()
        # split() never yields an empty token, so len(token) - 1 >= 0
        matched = bool(head) and (
            (head + stream.read(len(token) - 1)).lower() == token.lower())
        if not matched:
            # EOF or mismatch: undo everything we consumed
            stream.seek(start)
            return False

    # Every token matched. Skip trailing blanks; if that consumed a
    # non-blank byte, step back one byte to put it back.
    lookahead = self._next_nonblank()
    if lookahead:
        stream.seek(-1, io.SEEK_CUR)
    return True

def remainder(self):
    """
    Return the stream object holding the parse input (as established by
    the last _reset() call). Its read position is wherever scanning has
    left it, i.e. past any input already consumed.
    """
    stream = self._stream
    return stream

def _next_nonblank(self):
    """
    Consume bytes from the stream up to and including the first
    non-whitespace byte, and return that byte.

    Returns an empty bytes object at EOF. read() is called directly
    (rather than _getc()) because callers are prepared to treat the
    empty result as EOF.
    """
    read = self._stream.read
    while True:
        byte = read(1)
        # b''.isspace() is False, so EOF also ends the loop
        if not byte.isspace():
            return byte

def _getc(self, num=1):
    """
    Consume 'num' bytes from the stream and return them.

    If the stream yields fewer than 'num' bytes, the shortfall is
    reported through self._error().
    """
    data = self._stream.read(num)
    if len(data) >= num:
        return data
    self._error("Trying to read past end of stream")
    return data

def _peek(self, num=1, full=True):
# full=True means error if we can't peek ahead num bytes
if num < 0:
# There aren't many ways this can happen. The likeliest is that
# we've just read garbage length bytes from a binary input string.
# We happen to know that lengths are encoded as 4 bytes, so back
# off by 4 bytes to try to point the user at the right spot.
self._error("Invalid length field %d" % num, -4)
if self._index + num > len(self._buffer):
self._error("Trying to read past end of buffer")
return self._buffer[self._index:self._index + num]

def _getc(self, num=1):
chars = self._peek(num)
self._index += num
return chars
got = self._stream.peek(num)
if full and len(got) < num:
# Going right to this error is a little iffy:
# BufferedReader.peek() does not promise to return the requested
# length, but does not clarify the conditions under which it
# returns fewer bytes. If this is an actual problem, we could loop
# until we have the requested length or EOF -- but the loop must
# explicitly seek() past already-peeked data, then reset after.
# https://docs.python.org/3/library/io.html#io.BufferedReader.peek
self._error("Trying to peek past end of stream")

# Interestingly, peek() can also return MORE than requested -- but for
# our purposes (e.g. ord(peek(1))) it's important to constrain it.
return got[:num]

def _error(self, message, offset=0):
    """
    Raise LLSDParseError describing a problem in the input.

    :param message: description of the problem.
    :param offset: where the problem is, relative to the current read
        position of the stream.

    The stream is repositioned by 'offset' so that the offending byte
    (if any remains) can be peeked and included in the error text.
    """
    stream = self._stream
    # absolute position of the reported byte
    where = stream.tell() + offset
    stream.seek(offset, io.SEEK_CUR)
    # full=False: at EOF we still want to report, with an empty byte
    culprit = self._peek(1, full=False)
    raise LLSDParseError("%s at byte %d: %r" % (message, where, culprit))

# map char following escape char to corresponding character
_escaped = {
Expand All @@ -424,30 +505,26 @@ def _parse_string_delim(self, delim):
# Preallocate a working buffer for the decoded string output
# to avoid allocs in the hot loop.
decode_buff = self._decode_buff
# Cache these in locals, otherwise we have to perform a lookup on
# Cache this in locals, otherwise we have to perform a lookup on
# `self` in the hot loop.
buff = self._buffer
read_idx = self._index
getc = self._getc
cc = 0
while True:
try:
cc = buff[read_idx]
read_idx += 1
cc = ord(getc())

if cc == _BACKSLASH_ORD:
# Backslash, figure out if this is an \xNN hex escape or
# something like \t
cc = buff[read_idx]
read_idx += 1
cc = ord(getc())
if cc == _X_ORD:
# It's a hex escape. char is the value of the two
# following hex nybbles. This slice may result in
# a short read (0 or 1 bytes), but either a
# `ValueError` will be triggered by the first case,
# and the second will cause an `IndexError` on the
# next iteration of the loop.
hex_bytes = buff[read_idx:read_idx + 2]
read_idx += 2
hex_bytes = getc(2)
try:
# int() can parse a `bytes` containing hex,
# no explicit `bytes.decode("ascii")` required.
Expand All @@ -456,7 +533,6 @@ def _parse_string_delim(self, delim):
# One of the hex characters was likely invalid.
# Wrap the ValueError so that we can provide a
# byte offset in the error.
self._index = read_idx
self._error(e, offset=-2)
else:
# escape char preceding anything other than the chars
Expand All @@ -468,7 +544,6 @@ def _parse_string_delim(self, delim):
except IndexError:
# We can be reasonably sure that any IndexErrors inside here
# were caused by an out-of-bounds `buff[read_idx]`.
self._index = read_idx
self._error("Trying to read past end of buffer")

try:
Expand All @@ -483,19 +558,8 @@ def _parse_string_delim(self, delim):
insert_idx += 1

# Sync our local read index with the canonical one
self._index = read_idx
try:
# Slice off only what we used of the working decode buffer
return decode_buff[:insert_idx].decode('utf-8')
except UnicodeDecodeError as exc:
self._error(exc)


def starts_with(startstr, something):
if hasattr(something, 'startswith'):
return something.startswith(startstr)
else:
pos = something.tell()
s = something.read(len(startstr))
something.seek(pos, os.SEEK_SET)
return (s == startstr)
Loading