Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Implement load() and dump() for working with files.

dump() is just a simpler wrapper around dumps(), while load() does some
extra work to ensure it only reads as much data as it needs.
  • Loading branch information...
commit aee8a7b8ac9243b2b20560ee6e4980233e5d3c9e 1 parent 8da0a34
@rfk authored
View
6 README.rst
@@ -7,7 +7,7 @@ tnetstring: data serialization using typed netstrings
This is a data serialization library. It's a lot like JSON but it uses a
new syntax called "typed netstrings" that Zed has proposed for use in the
Mongrel2 webserver. It's designed to be simpler and easier to implement
-than JSON, with a happy consequence of also being faster.
+than JSON, with a happy consequence of also being faster in many cases.
An ordinary netstring is a blob of data prefixed with its length and postfixed
with a sanity-checking comma. The string "hello world" encodes like this::
@@ -37,6 +37,6 @@ When I get around to it, I will also add the following:
Note that since parsing a tnetstring requires reading all the data into memory
at once, there's no efficiency gain from using the file-based versions of these
-functions; I'm only planning to add them for API compatibility with other
-serialization modules e.g. pickle and json.
+functions. They're only here so you can use load() to read precisely one
+item from a file or socket without consuming any extra data.
View
176 tnetstring/__init__.py
@@ -7,7 +7,7 @@
This is a data serialization library. It's a lot like JSON but it uses a
new syntax called "typed netstrings" that Zed has proposed for use in the
Mongrel2 webserver. It's designed to be simpler and easier to implement
-than JSON, with a happy consequence of also being faster.
+than JSON, with a happy consequence of also being faster in many cases.
An ordinary netstring is a blob of data prefixed with its length and postfixed
with a sanity-checking comma. The string "hello world" encodes like this::
@@ -37,8 +37,8 @@
Note that since parsing a tnetstring requires reading all the data into memory
at once, there's no efficiency gain from using the file-based versions of these
-functions; I'm only planning to add them for API compatibility with other
-serialization modules e.g. pickle and json.
+functions. They're only here so you can use load() to read precisely one
+item from a file or socket without consuming any extra data.
"""
@@ -52,12 +52,6 @@
from collections import deque
-try:
- from cStringIO import StringIO
-except ImportError:
- from StringIO import StringIO
-
-
class Error(Exception):
"""Base error class for the tnetstring module."""
pass
@@ -77,75 +71,24 @@ def dumps(value):
This function dumps a python object as a tnetstring.
"""
- # This uses the naive bottom-up generator, it's quite slow.
- #return "".join(_gdumps(value))
- # This uses the write-in-reverse trick from the C version.
- # It's faster, but loses a lot in reversing the python strings.
- #s = StringIO()
- #_rdumps(s,value)
- #return s.getvalue()[::-1]
- # This uses a deque to collect output fragments in reverse order.
- # It's slightly faster than the _rdumps version as the strings
- # don't need to be reversed when outputting them.
+ # This uses a deque to collect output fragments in reverse order,
+ # then joins them together at the end. It's measurably faster
+ # than creating all the intermediate strings.
+ # If you're reading this to get a handle on the tnetstring format,
+ # consider the _gdumps() function instead; it's a standard top-down
+ # generator that's simpler to understand but much less efficient.
q = deque()
_rdumpq(q,0,value)
return "".join(q)
-def _rdumps(s,value):
- """Dump value as a tnetstring, to a StringIO instance, in reverse.
-
- This function writes out the tnetstring representation of the given value
- to the given StringIO instance, in reverse. Yes, in reverse.
+def dump(value, file):
+ """dump(object, file)
- Writing in reverse makes it easier to calculate all the length prefixes
- without building every little intermediate string. Unfortunately it means
- we have to reverse the string for each literal, but it pays off compared
- to the naive version.
+ This function dumps a python object as a tnetstring and writes it to
+ the given file.
"""
- write = s.write
- if value is None:
- write("~:0")
- elif value is True:
- write("!eurt:4")
- elif value is False:
- write("!eslaf:5")
- elif isinstance(value,(int,long)):
- data = str(value)
- write("#")
- write(data[::-1])
- write(":")
- write(str(len(data))[::-1])
- elif isinstance(value,(float,)):
- data = repr(value)
- write("#")
- write(data[::-1])
- write(":")
- write(str(len(data))[::-1])
- elif isinstance(value,(str,)):
- write(",")
- write(value[::-1])
- write(":")
- write(str(len(value))[::-1])
- elif isinstance(value,(list,tuple,)):
- write("]")
- i = s.tell()
- for item in reversed(value):
- _rdumps(s,item)
- i = s.tell() - i
- write(":")
- write(str(i)[::-1])
- elif isinstance(value,(dict,)):
- write("}")
- i = s.tell()
- for (k,v) in value.iteritems():
- _rdumps(s,v)
- _rdumps(s,k)
- i = s.tell() - i
- write(":")
- write(str(i)[::-1])
- else:
- raise DumpError("unserializable object")
+ file.write(dumps(value))
def _rdumpq(q,size,value):
@@ -160,8 +103,8 @@ def _rdumpq(q,size,value):
Operating last-chunk-first makes it easy to calculate the size written
for recursive structures without having to build their representation as
- a string. This is measurably faster than the _rdumps version because
- it avoids having to reverse lots of strings.
+ a string. This is measurably faster than generating the intermediate
+ strings, especially on deeply nested structures.
"""
write = q.appendleft
if value is None:
@@ -183,6 +126,10 @@ def _rdumpq(q,size,value):
write(span)
return size + 2 + len(span) + ldata
elif isinstance(value,(float,)):
+ # Use repr() for float rather than str().
+ # It round-trips more accurately.
+ # Probably unnecessary in later python versions that
+ # use David Gay's ftoa routines.
data = repr(value)
ldata = len(data)
span = str(ldata)
@@ -201,20 +148,20 @@ def _rdumpq(q,size,value):
return size + 2 + len(span) + lvalue
elif isinstance(value,(list,tuple,)):
write("]")
- i = size = size + 1
+ init_size = size = size + 1
for item in reversed(value):
size = _rdumpq(q,size,item)
- span = str(size - i)
+ span = str(size - init_size)
write(":")
write(span)
return size + 1 + len(span)
elif isinstance(value,(dict,)):
write("}")
- i = size = size + 1
+ init_size = size = size + 1
for (k,v) in value.iteritems():
size = _rdumpq(q,size,v)
size = _rdumpq(q,size,k)
- span = str(size - i)
+ span = str(size - init_size)
write(":")
write(span)
return size + 1 + len(span)
@@ -228,8 +175,8 @@ def _gdumps(value):
This is the naive dumping algorithm, implemented as a generator so that
it's easy to pass to "".join() without building a new list.
- This is mainly here for experimentation purposes; the _rdumps and _rdumpq
- versions are measurably faster.
+ This is mainly here for comparison purposes; the _rdumpq version is
+ measurably faster as it doesn't have to build intermediate strings.
"""
if value is None:
yield "0:~"
@@ -288,6 +235,72 @@ def loads(string):
return pop(string)[0]
+def load(file):
+ """load(file) -> object
+
+ This function reads a tnetstring from a file and parses it into a
+ python object. The file must support the read() method, and this
+ function promises not to read more data than necessary.
+ """
+ # Read the length prefix one char at a time.
+ c = file.read(1)
+ if not c.isdigit():
+ raise LoadError("not a tnetstring: missing or invalid length prefix")
+ datalen = ord(c) - ord("0")
+ c = file.read(1)
+ while c.isdigit():
+ datalen = (10 * datalen) + (ord(c) - ord("0"))
+ c = file.read(1)
+ if c != ":":
+ raise LoadError("not a tnetstring: missing or invalid length prefix")
+ # Now we can read and parse the payload.
+ # This repeats the dispatch logic of pop() so we can avoid
+ # re-constructing the outermost tnetstring.
+ data = file.read(datalen)
+ if len(data) != datalen:
+ raise LoadError("not a tnetstring: length prefix too big")
+ type = file.read(1)
+ if type == ",":
+ return data
+ if type == "#":
+ if "." in data or "e" in data or "E" in data:
+ try:
+ return float(data)
+ except ValueError:
+ raise LoadError("not a tnetstring: invalid float literal")
+ else:
+ try:
+ return int(data)
+ except ValueError:
+ raise LoadError("not a tnetstring: invalid integer literal")
+ if type == "!":
+ if data == "true":
+ return True
+ elif data == "false":
+ return False
+ else:
+ raise LoadError("not a tnetstring: invalid boolean literal")
+ if type == "~":
+ if data:
+ raise LoadError("not a tnetstring: invalid null literal")
+ return None
+ if type == "]":
+ l = []
+ while data:
+ (item,data) = pop(data)
+ l.append(item)
+ return l
+ if type == "}":
+ d = {}
+ while data:
+ (key,data) = pop(data)
+ (val,data) = pop(data)
+ d[key] = val
+ return d
+ raise LoadError("unknown type tag")
+
+
+
def pop(string):
"""pop(string) -> (object, remain)
@@ -305,7 +318,7 @@ def pop(string):
(data,type,remain) = (rest[:dlen],rest[dlen],rest[dlen+1:])
except IndexError:
# This fires if len(rest) < dlen, meaning we don't need
- # to validate that data is the right length.
+ # to further validate that data is the right length.
raise LoadError("not a tnetstring: invalid length prefix")
# Parse the data based on the type tag.
if type == ",":
@@ -358,9 +371,8 @@ def pop(string):
Error = _tnetstring.Error
LoadError = _tnetstring.LoadError
DumpError = _tnetstring.DumpError
- #dump = _tnetstring.dump
dumps = _tnetstring.dumps
- #load = _tnetstring.load
+ load = _tnetstring.load
loads = _tnetstring.loads
pop = _tnetstring.pop
View
120 tnetstring/_tnetstring.c
@@ -5,6 +5,7 @@
//
// dumps: dump a python object to a tnetstring
// loads: parse tnetstring into a python object
+// load: parse tnetstring from a file-like object
// pop: parse tnetstring into a python object,
// return it along with unparsed data.
@@ -47,6 +48,118 @@ _tnetstring_loads(PyObject* self, PyObject *args)
static PyObject*
+_tnetstring_load(PyObject* self, PyObject *args)
+{
+ PyObject *val = NULL;
+ PyObject *file = NULL;
+ PyObject *methnm = NULL;
+ PyObject *metharg = NULL;
+ PyObject *res = NULL;
+ char c, *data;
+ size_t datalen = 0;
+
+ // Grab file-like object as only argument
+ if(!PyArg_UnpackTuple(args, "load", 1, 1, &file)) {
+ goto error;
+ }
+ Py_INCREF(file);
+
+ // We're going to read one char at a time
+ if((methnm = PyString_FromString("read")) == NULL) {
+ goto error;
+ }
+ if((metharg = PyInt_FromLong(1)) == NULL) {
+ goto error;
+ }
+
+ // Read the length prefix one char at a time
+ res = PyObject_CallMethodObjArgs(file, methnm, metharg, NULL);
+ if(res == NULL) {
+ goto error;
+ }
+ Py_INCREF(res);
+ if(!PyString_Check(res) || !PyString_GET_SIZE(res)) {
+ PyErr_SetString(_tnetstring_Error,
+ "Not a tnetstring: invlaid or missing length prefix");
+ goto error;
+ }
+ c = PyString_AS_STRING(res)[0];
+ Py_DECREF(res); res = NULL;
+ if(c < '0' || c > '9') {
+ PyErr_SetString(_tnetstring_Error,
+ "Not a tnetstring: invlaid or missing length prefix");
+ goto error;
+ }
+ do {
+ datalen = (10 * datalen) + (c - '0');
+ res = PyObject_CallMethodObjArgs(file, methnm, metharg, NULL);
+ if(res == NULL) {
+ goto error;
+ }
+ Py_INCREF(res);
+ if(!PyString_Check(res) || !PyString_GET_SIZE(res)) {
+ PyErr_SetString(_tnetstring_Error,
+ "Not a tnetstring: invlaid or missing length prefix");
+ goto error;
+ }
+ c = PyString_AS_STRING(res)[0];
+ Py_DECREF(res); res = NULL;
+ } while(c >= '0' && c <= '9');
+
+ // Validate end-of-length-prefix marker.
+ if(c != ':') {
+ PyErr_SetString(_tnetstring_Error,
+ "Not a tnetstring: missing length prefix");
+ goto error;
+ }
+
+ // Read the data plus terminating type tag.
+ Py_DECREF(metharg);
+ if((metharg = PyInt_FromSize_t(datalen + 1)) == NULL) {
+ goto error;
+ }
+ res = PyObject_CallMethodObjArgs(file, methnm, metharg, NULL);
+ if(res == NULL) {
+ goto error;
+ }
+ Py_INCREF(res);
+ Py_DECREF(file); file = NULL;
+ Py_DECREF(methnm); methnm = NULL;
+ Py_DECREF(metharg); metharg = NULL;
+ if(!PyString_Check(res) || PyString_GET_SIZE(res) != datalen + 1) {
+ PyErr_SetString(_tnetstring_Error,
+ "Not a tnetstring: invalid length prefix");
+ goto error;
+ }
+
+ // Parse out the payload object
+ data = PyString_AS_STRING(res);
+ val = tns_parse_payload(data[datalen], data, datalen);
+ Py_DECREF(res); res = NULL;
+
+ return val;
+
+error:
+ if(file != NULL) {
+ Py_DECREF(file);
+ }
+ if(methnm != NULL) {
+ Py_DECREF(methnm);
+ }
+ if(metharg != NULL) {
+ Py_DECREF(metharg);
+ }
+ if(res != NULL) {
+ Py_DECREF(res);
+ }
+ if(val != NULL) {
+ Py_DECREF(val);
+ }
+ return NULL;
+}
+
+
+static PyObject*
_tnetstring_pop(PyObject* self, PyObject *args)
{
PyObject *string, *val, *rest;
@@ -110,6 +223,13 @@ _tnetstring_dumps(PyObject* self, PyObject *args, PyObject *kwds)
static PyMethodDef _tnetstring_methods[] = {
+ {"load",
+ (PyCFunction)_tnetstring_load,
+ METH_VARARGS,
+ PyDoc_STR("load(file) -> object\n"
+ "This function reads a tnetstring from a file and parses it\n"
+ " into a python object.")},
+
{"loads",
(PyCFunction)_tnetstring_loads,
METH_VARARGS,
View
22 tnetstring/tests/test_format.py
@@ -85,3 +85,25 @@ def test_roundtrip_format_random(self):
self.assertEqual(v,tnetstring.loads(tnetstring.dumps(v)))
self.assertEqual((v,""),tnetstring.pop(tnetstring.dumps(v)))
+
+class Test_FileLoading(unittest.TestCase):
+
+ def test_roundtrip_file_examples(self):
+ for data, expect in FORMAT_EXAMPLES.items():
+ s = StringIO.StringIO()
+ s.write(data)
+ s.seek(0)
+ self.assertEqual(expect,tnetstring.load(s))
+ s = StringIO.StringIO()
+ tnetstring.dump(expect,s)
+ s.seek(0)
+ self.assertEqual(expect,tnetstring.load(s))
+
+ def test_roundtrip_format_random(self):
+ for _ in xrange(500):
+ v = get_random_object()
+ s = StringIO.StringIO()
+ tnetstring.dump(v,s)
+ s.seek(0)
+ self.assertEqual(v,tnetstring.load(s))
+
View
31 tnetstring/tns_core.c
@@ -37,7 +37,6 @@ static size_t tns_strtosz(const char *data, size_t len, size_t *sz, char **end);
static void* tns_parse(const char *data, size_t len, char **remain)
{
- void *val = NULL;
char *valstr = NULL;
tns_type_tag type = tns_tag_null;
size_t vallen = 0;
@@ -59,24 +58,35 @@ static void* tns_parse(const char *data, size_t len, char **remain)
}
// Now dispatch type parsing based on the type tag.
+ return tns_parse_payload(type, valstr, vallen);
+
+error:
+ return NULL;
+}
+
+
+static void* tns_parse_payload(tns_type_tag type, const char *data, size_t len)
+{
+ void *val = NULL;
+
switch(type) {
// Primitive type: a string blob.
case tns_tag_string:
- val = tns_parse_string(valstr, vallen);
+ val = tns_parse_string(data, len);
check(val != NULL, "Not a tnetstring: invalid string literal.");
break;
// Primitive type: a number.
// I'm branching out here and allowing both floats and ints.
case tns_tag_number:
- val = tns_parse_number(valstr, vallen);
+ val = tns_parse_number(data, len);
check(val != NULL, "Not a tnetstring: invalid number literal.");
break;
// Primitive type: a boolean.
// The only acceptable values are "true" and "false".
case tns_tag_bool:
- if(vallen == 4 && STR_EQ_TRUE(valstr)) {
+ if(len == 4 && STR_EQ_TRUE(data)) {
val = tns_get_true();
- } else if(vallen == 5 && STR_EQ_FALSE(valstr)) {
+ } else if(len == 5 && STR_EQ_FALSE(data)) {
val = tns_get_false();
} else {
sentinel("Not a tnetstring: invalid boolean literal.");
@@ -86,21 +96,21 @@ static void* tns_parse(const char *data, size_t len, char **remain)
// Primitive type: a null.
// This must be a zero-length string.
case tns_tag_null:
- check(vallen == 0, "Not a tnetstring: invalid null literal");
+ check(len == 0, "Not a tnetstring: invalid null literal");
val = tns_get_null();
break;
// Compound type: a dict.
// The data is written <key><value><key><value>
case tns_tag_dict:
val = tns_new_dict();
- check(tns_parse_dict(val,valstr,vallen) != -1,
+ check(tns_parse_dict(val,data,len) != -1,
"Not a tnetstring: broken dict items.");
break;
// Compound type: a list.
// The data is written <item><item><item>
case tns_tag_list:
val = tns_new_list();
- check(tns_parse_list(val,valstr,vallen) != -1,
+ check(tns_parse_list(val,data,len) != -1,
"Not a tnetstring: broken list items.");
break;
// Whoops, that ain't a tnetstring.
@@ -252,7 +262,8 @@ static int tns_parse_dict(void *val, const char *data, size_t len)
static inline size_t
tns_strtosz(const char *data, size_t len, size_t *sz, char **end)
{
- char *pos, *eod, c;
+ char c;
+ const char *pos, *eod;
size_t value = 0;
pos = data;
@@ -271,7 +282,7 @@ tns_strtosz(const char *data, size_t len, size_t *sz, char **end)
c = *pos;
if(c < '0' || c > '9') {
*sz = value;
- *end = pos;
+ *end = (char*) pos;
return 0;
}
value = (value * 10) + (c - '0');
View
5 tnetstring/tns_core.h
@@ -75,6 +75,11 @@ static void tns_free_value(void *value);
// receive the unparsed remainder of the string.
static void* tns_parse(const char *data, size_t len, char** remain);
+// If you need to read the length prefix yourself, e.g. because you're
+// reading data off a socket, you can use this function to get just
+// the payload parsing logic.
+static void* tns_parse_payload(tns_type_tag type, const char *data, size_t len);
+
// Render an object into a string.
// On success this function returns a malloced string containing
// the serialization of the given object. The second argument
Please sign in to comment.
Something went wrong with that request. Please try again.