Skip to content

Commit

Permalink
Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Sep 4, 2010
1 parent d9107aa commit 7d6e076
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 48 deletions.
10 changes: 8 additions & 2 deletions Lib/json/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,14 @@ def py_scanstring(s, end, strict=True,


def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
_w=WHITESPACE.match, _ws=WHITESPACE_STR):
memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end
pairs = []
pairs_append = pairs.append
# Backwards compatibility
if memo is None:
memo = {}
memo_get = memo.setdefault
# Use a slice to prevent IndexError from being raised, the following
# check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1]
Expand All @@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
end += 1
while True:
key, end = scanstring(s, end, strict)
key = memo_get(key, key)
# To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':':
Expand Down Expand Up @@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
pairs = object_hook(pairs)
return pairs, end

def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end
values = []
nextchar = s[end:end + 1]
Expand Down Expand Up @@ -314,6 +319,7 @@ def __init__(self, object_hook=None, parse_float=None,
self.parse_object = JSONObject
self.parse_array = JSONArray
self.parse_string = scanstring
self.memo = {}
self.scan_once = make_scanner(self)


Expand Down
10 changes: 9 additions & 1 deletion Lib/json/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def py_make_scanner(context):
parse_int = context.parse_int
parse_constant = context.parse_constant
object_hook = context.object_hook
object_pairs_hook = context.object_pairs_hook
memo = context.memo

def _scan_once(string, idx):
try:
Expand All @@ -33,7 +35,7 @@ def _scan_once(string, idx):
return parse_string(string, idx + 1, strict)
elif nextchar == '{':
return parse_object((string, idx + 1), strict,
_scan_once, object_hook, object_pairs_hook)
_scan_once, object_hook, object_pairs_hook, memo)
elif nextchar == '[':
return parse_array((string, idx + 1), _scan_once)
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
Expand All @@ -60,6 +62,12 @@ def _scan_once(string, idx):
else:
raise StopIteration

def scan_once(string, idx):
try:
return _scan_once(string, idx)
finally:
memo.clear()

return _scan_once

make_scanner = c_make_scanner or py_make_scanner
28 changes: 28 additions & 0 deletions Lib/json/tests/test_decode.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
import decimal
from unittest import TestCase
from io import StringIO
from contextlib import contextmanager

import json
import json.decoder
import json.scanner
from collections import OrderedDict


@contextmanager
def use_python_scanner():
py_scanner = json.scanner.py_make_scanner
old_scanner = json.decoder.make_scanner
json.decoder.make_scanner = py_scanner
try:
yield
finally:
json.decoder.make_scanner = old_scanner


class TestDecode(TestCase):
def test_decimal(self):
rval = json.loads('1.1', parse_float=decimal.Decimal)
Expand Down Expand Up @@ -39,3 +54,16 @@ def test_decoder_optimizations(self):
# exercise the uncommon cases. The array cases are already covered.
rval = json.loads('{ "key" : "value" , "k":"v" }')
self.assertEquals(rval, {"key":"value", "k":"v"})

def check_keys_reuse(self, source, loads):
    # Decode *source* with *loads* and require that the first two decoded
    # objects use the very same string objects as keys (identity, not
    # just equality).  Each object must have exactly two keys.
    decoded = loads(source)
    first_keys = sorted(decoded[0])
    second_keys = sorted(decoded[1])
    (a, b), (c, d) = first_keys, second_keys
    for left, right in ((a, c), (b, d)):
        self.assertIs(left, right)

def test_keys_reuse(self):
    # Two sibling objects with the same keys; one key is non-ASCII.
    document = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
    # First with whatever scanner json.loads uses by default.
    self.check_keys_reuse(document, json.loads)
    # Then with a decoder created while make_scanner is patched to the
    # pure-Python implementation.
    with use_python_scanner():
        python_decode = json.decoder.JSONDecoder().decode
        self.check_keys_reuse(document, python_decode)
4 changes: 4 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ Extensions
Library
-------

- Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.

- Issue #1100562: Fix deep-copying of objects derived from the list and
dict types. Patch by Michele Orrù and Björn Lindqvist.

Expand Down
127 changes: 82 additions & 45 deletions Modules/_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
PyObject *parse_float;
PyObject *parse_int;
PyObject *parse_constant;
PyObject *memo;
} PyScannerObject;

static PyMemberDef scanner_members[] = {
Expand Down Expand Up @@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
return tpl;
}

/*
 * Flush the pending `chunk`, if any, into the `chunks` list, creating the
 * list lazily on first use.  Expects `chunk`, `chunks` and a `bail` label
 * to exist in the expanding function.
 *
 * On success `chunk` is reset to NULL.  On failure control jumps to `bail`.
 * If the append fails, `chunk` is cleared here (released and set to NULL)
 * so that cleanup code at `bail` which releases `chunk` again cannot
 * decref the same object twice.  If list creation fails, `chunk` is left
 * set for `bail` to release.
 */
#define APPEND_OLD_CHUNK \
    if (chunk != NULL) { \
        if (chunks == NULL) { \
            chunks = PyList_New(0); \
            if (chunks == NULL) { \
                goto bail; \
            } \
        } \
        if (PyList_Append(chunks, chunk)) { \
            Py_CLEAR(chunk); \
            goto bail; \
        } \
        Py_CLEAR(chunk); \
    }

static PyObject *
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
{
Expand All @@ -316,23 +332,21 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Return value is a new PyUnicode
*/
PyObject *rval;
PyObject *rval = NULL;
Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
Py_ssize_t begin = end - 1;
Py_ssize_t next = begin;
const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
PyObject *chunks = PyList_New(0);
if (chunks == NULL) {
goto bail;
}
PyObject *chunks = NULL;
PyObject *chunk = NULL;

if (end < 0 || len <= end) {
PyErr_SetString(PyExc_ValueError, "end is out of bounds");
goto bail;
}
while (1) {
/* Find the end of the string or the next escape */
Py_UNICODE c = 0;
PyObject *chunk = NULL;
for (next = end; next < len; next++) {
c = buf[next];
if (c == '"' || c == '\\') {
Expand All @@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
/* Pick up this chunk if it's not zero length */
if (next != end) {
APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&buf[end], next - end);
if (chunk == NULL) {
goto bail;
}
if (PyList_Append(chunks, chunk)) {
Py_DECREF(chunk);
goto bail;
}
Py_DECREF(chunk);
}
next++;
if (c == '"') {
Expand Down Expand Up @@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
#endif
}
APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&c, 1);
if (chunk == NULL) {
goto bail;
}
if (PyList_Append(chunks, chunk)) {
Py_DECREF(chunk);
}

if (chunks == NULL) {
if (chunk != NULL)
rval = chunk;
else
rval = PyUnicode_FromStringAndSize("", 0);
}
else {
APPEND_OLD_CHUNK
rval = join_list_unicode(chunks);
if (rval == NULL) {
goto bail;
}
Py_DECREF(chunk);
Py_CLEAR(chunks);
}

rval = join_list_unicode(chunks);
if (rval == NULL) {
goto bail;
}
Py_DECREF(chunks);
*next_end_ptr = end;
return rval;
bail:
*next_end_ptr = -1;
Py_XDECREF(chunks);
Py_XDECREF(chunk);
return NULL;
}

Expand Down Expand Up @@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
Py_CLEAR(s->parse_float);
Py_CLEAR(s->parse_int);
Py_CLEAR(s->parse_constant);
Py_CLEAR(s->memo);
return 0;
}

Expand All @@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
PyObject *val = NULL;
PyObject *rval = PyList_New(0);
PyObject *rval = NULL;
PyObject *key = NULL;
int strict = PyObject_IsTrue(s->strict);
int has_pairs_hook = (s->object_pairs_hook != Py_None);
Py_ssize_t next_idx;

if (has_pairs_hook)
rval = PyList_New(0);
else
rval = PyDict_New();
if (rval == NULL)
return NULL;

Expand All @@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
/* only loop if the object is non-empty */
if (idx <= end_idx && str[idx] != '}') {
while (idx <= end_idx) {
PyObject *memokey;

/* read key */
if (str[idx] != '"') {
raise_errmsg("Expecting property name", pystr, idx);
Expand All @@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
if (key == NULL)
goto bail;
memokey = PyDict_GetItem(s->memo, key);
if (memokey != NULL) {
Py_INCREF(memokey);
Py_DECREF(key);
key = memokey;
}
else {
if (PyDict_SetItem(s->memo, key, key) < 0)
goto bail;
}
idx = next_idx;

/* skip whitespace between key and : delimiter, read :, skip whitespace */
Expand All @@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
if (val == NULL)
goto bail;

{
PyObject *tuple = PyTuple_Pack(2, key, val);
if (tuple == NULL)
if (has_pairs_hook) {
PyObject *item = PyTuple_Pack(2, key, val);
if (item == NULL)
goto bail;
if (PyList_Append(rval, tuple) == -1) {
Py_DECREF(tuple);
Py_CLEAR(key);
Py_CLEAR(val);
if (PyList_Append(rval, item) == -1) {
Py_DECREF(item);
goto bail;
}
Py_DECREF(tuple);
Py_DECREF(item);
}
else {
if (PyDict_SetItem(rval, key, val) < 0)
goto bail;
Py_CLEAR(key);
Py_CLEAR(val);
}

Py_CLEAR(key);
Py_CLEAR(val);
idx = next_idx;

/* skip whitespace before } or , */
Expand Down Expand Up @@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss

*next_idx_ptr = idx + 1;

if (s->object_pairs_hook != Py_None) {
if (has_pairs_hook) {
val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
if (val == NULL)
goto bail;
Py_DECREF(rval);
return val;
}

val = PyDict_New();
if (val == NULL)
goto bail;
if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
goto bail;
Py_DECREF(rval);
rval = val;

/* if object_hook is not None: rval = object_hook(rval) */
if (s->object_hook != Py_None) {
val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
if (val == NULL)
goto bail;
Py_DECREF(rval);
rval = val;
val = NULL;
return val;
}
return rval;
bail:
Py_XDECREF(key);
Py_XDECREF(val);
Py_DECREF(rval);
Py_XDECREF(rval);
return NULL;
}

Expand Down Expand Up @@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
Py_TYPE(pystr)->tp_name);
return NULL;
}
PyDict_Clear(s->memo);
if (rval == NULL)
return NULL;
return _build_rval_index_tuple(rval, next_idx);
}

Expand Down Expand Up @@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
return -1;

if (s->memo == NULL) {
s->memo = PyDict_New();
if (s->memo == NULL)
goto bail;
}

/* All of these will fail "gracefully" so we don't need to verify them */
s->strict = PyObject_GetAttrString(ctx, "strict");
if (s->strict == NULL)
Expand Down

0 comments on commit 7d6e076

Please sign in to comment.