Permalink
Browse files

implement unicode handling in pure-python version

  • Loading branch information...
1 parent f6dbb40 commit 020fbc92d1fc0e083ea3c30567656e2e62d369fe @rfk committed May 6, 2011
Showing with 63 additions and 23 deletions.
  1. +41 −19 tnetstring/__init__.py
  2. +9 −1 tnetstring/_tnetstring.c
  3. +10 −0 tnetstring/tests/test_format.py
  4. +3 −3 tnetstring/tns_core.c
View
@@ -48,7 +48,7 @@
from collections import deque
-def dumps(value):
+def dumps(value,encoding=None):
"""dumps(object) -> string
This function dumps a python object as a tnetstring.
@@ -60,20 +60,20 @@ def dumps(value):
# consider the _gdumps() function instead; it's a standard top-down
# generator that's simpler to understand but much less efficient.
q = deque()
- _rdumpq(q,0,value)
+ _rdumpq(q,0,value,encoding)
return "".join(q)
-def dump(value, file):
+def dump(value,file,encoding=None):
"""dump(object, file)
This function dumps a python object as a tnetstring and writes it to
the given file.
"""
- file.write(dumps(value))
+ file.write(dumps(value,encoding))
-def _rdumpq(q,size,value):
+def _rdumpq(q,size,value,encoding=None):
"""Dump value as a tnetstring, to a deque instance, last chunks first.
This function generates the tnetstring representation of the given value,
@@ -92,13 +92,13 @@ def _rdumpq(q,size,value):
if value is None:
write("0:~")
return size + 3
- elif value is True:
+ if value is True:
write("4:true!")
return size + 7
- elif value is False:
+ if value is False:
write("5:false!")
return size + 8
- elif isinstance(value,(int,long)):
+ if isinstance(value,(int,long)):
data = str(value)
ldata = len(data)
span = str(ldata)
@@ -107,7 +107,7 @@ def _rdumpq(q,size,value):
write(":")
write(span)
return size + 2 + len(span) + ldata
- elif isinstance(value,(float,)):
+ if isinstance(value,(float,)):
# Use repr() for float rather than str().
# It round-trips more accurately.
# Probably unnecessary in later python versions that
@@ -120,15 +120,15 @@ def _rdumpq(q,size,value):
write(":")
write(span)
return size + 2 + len(span) + ldata
- elif isinstance(value,(str,)):
+ if isinstance(value,str):
lvalue = len(value)
span = str(lvalue)
write(",")
write(value)
write(":")
write(span)
return size + 2 + len(span) + lvalue
- elif isinstance(value,(list,tuple,)):
+ if isinstance(value,(list,tuple,)):
write("]")
init_size = size = size + 1
for item in reversed(value):
@@ -137,7 +137,7 @@ def _rdumpq(q,size,value):
write(":")
write(span)
return size + 1 + len(span)
- elif isinstance(value,(dict,)):
+ if isinstance(value,dict):
write("}")
init_size = size = size + 1
for (k,v) in value.iteritems():
@@ -147,11 +147,21 @@ def _rdumpq(q,size,value):
write(":")
write(span)
return size + 1 + len(span)
- else:
- raise ValueError("unserializable object")
+ if isinstance(value,unicode):
+ if encoding is None:
+ raise ValueError("must specify encoding to dump unicode strings")
+ value = value.encode(encoding)
+ lvalue = len(value)
+ span = str(lvalue)
+ write(",")
+ write(value)
+ write(":")
+ write(span)
+ return size + 2 + len(span) + lvalue
+ raise ValueError("unserializable object")
-def _gdumps(value):
+def _gdumps(value,encoding):
"""Generate fragments of value dumped as a tnetstring.
This is the naive dumping algorithm, implemented as a generator so that
@@ -202,22 +212,30 @@ def _gdumps(value):
yield ":"
yield sub
yield "}"
+ elif isinstance(value,(unicode,)):
+ if encoding is None:
+ raise ValueError("must specify encoding to dump unicode strings")
+ value = value.encode(encoding)
+ yield str(len(value))
+ yield ":"
+ yield value
+ yield ","
else:
raise ValueError("unserializable object")
-def loads(string):
+def loads(string,encoding=None):
"""loads(string) -> object
This function parses a tnetstring into a python object.
"""
# No point duplicating effort here. In the C-extension version,
# loads() is measurably faster than pop() since it can avoid
# the overhead of building a second string.
- return pop(string)[0]
+ return pop(string,encoding)[0]
-def load(file):
+def load(file,encoding=None):
"""load(file) -> object
This function reads a tnetstring from a file and parses it into a
@@ -248,6 +266,8 @@ def load(file):
raise ValueError("not a tnetstring: length prefix too big")
type = file.read(1)
if type == ",":
+ if encoding is not None:
+ return data.decode(encoding)
return data
if type == "#":
try:
@@ -287,7 +307,7 @@ def load(file):
-def pop(string):
+def pop(string,encoding=None):
"""pop(string) -> (object, remain)
This function parses a tnetstring into a python object.
@@ -308,6 +328,8 @@ def pop(string):
raise ValueError("not a tnetstring: invalid length prefix")
# Parse the data based on the type tag.
if type == ",":
+ if encoding is not None:
+ return (data.decode(encoding),remain)
return (data,remain)
if type == "#":
try:
View
@@ -17,6 +17,8 @@
static tns_ops _tnetstring_ops;
+// _tnetstring_loads: parse tnetstring-format value from a string.
+//
static PyObject*
_tnetstring_loads(PyObject* self, PyObject *args)
{
@@ -45,6 +47,12 @@ _tnetstring_loads(PyObject* self, PyObject *args)
}
+// _tnetstring_load: parse tnetstring-format value from a file.
+//
+// This takes care to read no more data than is required to get the
+// full tnetstring-encoded value. It might read arbitrarily-much
+// data if the file doesn't begin with a valid tnetstring.
+//
static PyObject*
_tnetstring_load(PyObject* self, PyObject *args)
{
@@ -83,7 +91,7 @@ _tnetstring_load(PyObject* self, PyObject *args)
}
c = PyString_AS_STRING(res)[0];
Py_DECREF(res); res = NULL;
- // Note that the netsring spec explicitly forbids padding zeroes.
+ // Note that the netstring spec explicitly forbids padding zeroes.
// If the first char is zero, it must be the only char.
if(c < '0' || c > '9') {
PyErr_SetString(PyExc_ValueError,
@@ -89,6 +89,16 @@ def test_roundtrip_format_random(self):
self.assertEqual(v,tnetstring.loads(tnetstring.dumps(v)))
self.assertEqual((v,""),tnetstring.pop(tnetstring.dumps(v)))
+ def test_unicode_handling(self):
+ self.assertRaises(ValueError,tnetstring.dumps,u"hello")
+ self.assertEquals(tnetstring.dumps(u"hello","utf8"),"5:hello,")
+ self.assertEquals(type(tnetstring.loads("5:hello,")),str)
+ self.assertEquals(type(tnetstring.loads("5:hello,","utf8")),unicode)
+ ALPHA = u"\N{GREEK CAPITAL LETTER ALPHA}lpha"
+ self.assertEquals(tnetstring.dumps(ALPHA,"utf8"),"6:"+ALPHA.encode("utf8")+",")
+ self.assertEquals(tnetstring.dumps(ALPHA,"utf16"),"12:"+ALPHA.encode("utf16")+",")
+ self.assertEquals(tnetstring.loads("12:\xff\xfe\x91\x03l\x00p\x00h\x00a\x00,","utf16"),ALPHA)
+
class Test_FileLoading(unittest.TestCase):
View
@@ -13,10 +13,10 @@
#define TNS_MAX_LENGTH 999999999
#endif
-// Current outbuf implementations writes data starting at the back of
-// the allocaed buffer. When finished we simply memmove it to the front.
+// Current outbuf implementation writes data starting at the back of
+// the allocated buffer. When finished we simply memmove it to the front.
// Here *buffer points to the allocated buffer, while *head points to the
-// last characer written to the buffer.
+// last character written to the buffer (and thus decreases as we write).
struct tns_outbuf_s {
char *buffer;
char *head;

0 comments on commit 020fbc9

Please sign in to comment.