Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

implement unicode handling in pure-python version

  • Loading branch information...
commit 020fbc92d1fc0e083ea3c30567656e2e62d369fe 1 parent f6dbb40
Ryan Kelly authored May 06, 2011
60  tnetstring/__init__.py
@@ -48,7 +48,7 @@
48 48
 from collections import deque
49 49
 
50 50
 
51  
-def dumps(value):
  51
+def dumps(value,encoding=None):
52 52
     """dumps(object) -> string
53 53
 
54 54
     This function dumps a python object as a tnetstring.
@@ -60,20 +60,20 @@ def dumps(value):
60 60
     #  consider the _gdumps() function instead; it's a standard top-down
61 61
     #  generator that's simpler to understand but much less efficient.
62 62
     q = deque()
63  
-    _rdumpq(q,0,value)
  63
+    _rdumpq(q,0,value,encoding)
64 64
     return "".join(q)
65 65
 
66 66
 
67  
-def dump(value, file):
  67
+def dump(value,file,encoding=None):
68 68
     """dump(object, file)
69 69
 
70 70
     This function dumps a python object as a tnetstring and writes it to
71 71
     the given file.
72 72
     """
73  
-    file.write(dumps(value))
  73
+    file.write(dumps(value,encoding))
74 74
 
75 75
 
76  
-def _rdumpq(q,size,value):
  76
+def _rdumpq(q,size,value,encoding=None):
77 77
     """Dump value as a tnetstring, to a deque instance, last chunks first.
78 78
 
79 79
     This function generates the tnetstring representation of the given value,
@@ -92,13 +92,13 @@ def _rdumpq(q,size,value):
92 92
     if value is None:
93 93
         write("0:~")
94 94
         return size + 3
95  
-    elif value is True:
  95
+    if value is True:
96 96
         write("4:true!")
97 97
         return size + 7
98  
-    elif value is False:
  98
+    if value is False:
99 99
         write("5:false!")
100 100
         return size + 8
101  
-    elif isinstance(value,(int,long)):
  101
+    if isinstance(value,(int,long)):
102 102
         data = str(value) 
103 103
         ldata = len(data)
104 104
         span = str(ldata)
@@ -107,7 +107,7 @@ def _rdumpq(q,size,value):
107 107
         write(":")
108 108
         write(span)
109 109
         return size + 2 + len(span) + ldata
110  
-    elif isinstance(value,(float,)):
  110
+    if isinstance(value,(float,)):
111 111
         #  Use repr() for float rather than str().
112 112
         #  It round-trips more accurately.
113 113
         #  Probably unnecessary in later python versions that
@@ -120,7 +120,7 @@ def _rdumpq(q,size,value):
120 120
         write(":")
121 121
         write(span)
122 122
         return size + 2 + len(span) + ldata
123  
-    elif isinstance(value,(str,)):
  123
+    if isinstance(value,str):
124 124
         lvalue = len(value)
125 125
         span = str(lvalue)
126 126
         write(",")
@@ -128,7 +128,7 @@ def _rdumpq(q,size,value):
128 128
         write(":")
129 129
         write(span)
130 130
         return size + 2 + len(span) + lvalue
131  
-    elif isinstance(value,(list,tuple,)):
  131
+    if isinstance(value,(list,tuple,)):
132 132
         write("]")
133 133
         init_size = size = size + 1
134 134
         for item in reversed(value):
@@ -137,7 +137,7 @@ def _rdumpq(q,size,value):
137 137
         write(":")
138 138
         write(span)
139 139
         return size + 1 + len(span)
140  
-    elif isinstance(value,(dict,)):
  140
+    if isinstance(value,dict):
141 141
         write("}")
142 142
         init_size = size = size + 1
143 143
         for (k,v) in value.iteritems():
@@ -147,11 +147,21 @@ def _rdumpq(q,size,value):
147 147
         write(":")
148 148
         write(span)
149 149
         return size + 1 + len(span)
150  
-    else:
151  
-        raise ValueError("unserializable object")
  150
+    if isinstance(value,unicode):
  151
+        if encoding is None:
  152
+            raise ValueError("must specify encoding to dump unicode strings")
  153
+        value = value.encode(encoding)
  154
+        lvalue = len(value)
  155
+        span = str(lvalue)
  156
+        write(",")
  157
+        write(value)
  158
+        write(":")
  159
+        write(span)
  160
+        return size + 2 + len(span) + lvalue
  161
+    raise ValueError("unserializable object")
152 162
 
153 163
 
154  
-def _gdumps(value):
  164
+def _gdumps(value,encoding):
155 165
     """Generate fragments of value dumped as a tnetstring.
156 166
 
157 167
     This is the naive dumping algorithm, implemented as a generator so that
@@ -202,11 +212,19 @@ def _gdumps(value):
202 212
         yield ":"
203 213
         yield sub
204 214
         yield "}"
  215
+    elif isinstance(value,(unicode,)):
  216
+        if encoding is None:
  217
+            raise ValueError("must specify encoding to dump unicode strings")
  218
+        value = value.encode(encoding)
  219
+        yield str(len(value))
  220
+        yield ":"
  221
+        yield value
  222
+        yield ","
205 223
     else:
206 224
         raise ValueError("unserializable object")
207 225
 
208 226
 
209  
-def loads(string):
  227
+def loads(string,encoding=None):
210 228
     """loads(string) -> object
211 229
 
212 230
     This function parses a tnetstring into a python object.
@@ -214,10 +232,10 @@ def loads(string):
214 232
     #  No point duplicating effort here.  In the C-extension version,
215 233
     #  loads() is measurably faster than pop() since it can avoid
216 234
     #  the overhead of building a second string.
217  
-    return pop(string)[0]
  235
+    return pop(string,encoding)[0]
218 236
 
219 237
 
220  
-def load(file):
  238
+def load(file,encoding=None):
221 239
     """load(file) -> object
222 240
 
223 241
     This function reads a tnetstring from a file and parses it into a
@@ -248,6 +266,8 @@ def load(file):
248 266
         raise ValueError("not a tnetstring: length prefix too big")
249 267
     type = file.read(1)
250 268
     if type == ",":
  269
+        if encoding is not None:
  270
+            return data.decode(encoding)
251 271
         return data
252 272
     if type == "#":
253 273
         try:
@@ -287,7 +307,7 @@ def load(file):
287 307
     
288 308
 
289 309
 
290  
-def pop(string):
  310
+def pop(string,encoding=None):
291 311
     """pop(string) -> (object, remain)
292 312
 
293 313
     This function parses a tnetstring into a python object.
@@ -308,6 +328,8 @@ def pop(string):
308 328
         raise ValueError("not a tnetstring: invalid length prefix")
309 329
     #  Parse the data based on the type tag.
310 330
     if type == ",":
  331
+        if encoding is not None:
  332
+            return (data.decode(encoding),remain)
311 333
         return (data,remain)
312 334
     if type == "#":
313 335
         try:
10  tnetstring/_tnetstring.c
@@ -17,6 +17,8 @@
17 17
 
18 18
 static tns_ops _tnetstring_ops;
19 19
 
  20
+//  _tnetstring_loads:  parse tnetstring-format value from a string.
  21
+//
20 22
 static PyObject*
21 23
 _tnetstring_loads(PyObject* self, PyObject *args) 
22 24
 {
@@ -45,6 +47,12 @@ _tnetstring_loads(PyObject* self, PyObject *args)
45 47
 }
46 48
 
47 49
 
  50
+//  _tnetstring_load:  parse tnetstring-format value from a file.
  51
+//
  52
+//  This takes care to read no more data than is required to get the
  53
+//  full tnetstring-encoded value.  It might read arbitrarily-much
  54
+//  data if the file doesn't begin with a valid tnetstring.
  55
+//
48 56
 static PyObject*
49 57
 _tnetstring_load(PyObject* self, PyObject *args) 
50 58
 {
@@ -83,7 +91,7 @@ _tnetstring_load(PyObject* self, PyObject *args)
83 91
   }
84 92
   c = PyString_AS_STRING(res)[0];
85 93
   Py_DECREF(res); res = NULL;
86  
-  //  Note that the netsring spec explicitly forbids padding zeroes.
  94
+  //  Note that the netstring spec explicitly forbids padding zeroes.
87 95
   //  If the first char is zero, it must be the only char.
88 96
   if(c < '0' || c > '9') {
89 97
       PyErr_SetString(PyExc_ValueError,
10  tnetstring/tests/test_format.py
@@ -89,6 +89,16 @@ def test_roundtrip_format_random(self):
89 89
             self.assertEqual(v,tnetstring.loads(tnetstring.dumps(v)))
90 90
             self.assertEqual((v,""),tnetstring.pop(tnetstring.dumps(v)))
91 91
 
  92
+    def test_unicode_handling(self):
  93
+        self.assertRaises(ValueError,tnetstring.dumps,u"hello")
  94
+        self.assertEquals(tnetstring.dumps(u"hello","utf8"),"5:hello,")
  95
+        self.assertEquals(type(tnetstring.loads("5:hello,")),str)
  96
+        self.assertEquals(type(tnetstring.loads("5:hello,","utf8")),unicode)
  97
+        ALPHA = u"\N{GREEK CAPITAL LETTER ALPHA}lpha"
  98
+        self.assertEquals(tnetstring.dumps(ALPHA,"utf8"),"6:"+ALPHA.encode("utf8")+",")
  99
+        self.assertEquals(tnetstring.dumps(ALPHA,"utf16"),"12:"+ALPHA.encode("utf16")+",")
  100
+        self.assertEquals(tnetstring.loads("12:\xff\xfe\x91\x03l\x00p\x00h\x00a\x00,","utf16"),ALPHA)
  101
+
92 102
 
93 103
 class Test_FileLoading(unittest.TestCase):
94 104
 
6  tnetstring/tns_core.c
@@ -13,10 +13,10 @@
13 13
 #define TNS_MAX_LENGTH 999999999
14 14
 #endif
15 15
 
16  
-//  Current outbuf implementations writes data starting at the back of
17  
-//  the allocaed buffer.  When finished we simply memmove it to the front.
  16
+//  Current outbuf implementation writes data starting at the back of
  17
+//  the allocated buffer.  When finished we simply memmove it to the front.
18 18
 //  Here *buffer points to the allocated buffer, while *head points to the
19  
-//  last characer written to the buffer.
  19
+//  last character written to the buffer (and thus decreases as we write).
20 20
 struct tns_outbuf_s {
21 21
   char *buffer;
22 22
   char *head;

0 notes on commit 020fbc9

Please sign in to comment.
Something went wrong with that request. Please try again.