diff --git a/CHANGES.txt b/CHANGES.txt index d2f7a94..c3e176c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,31 @@ +Version 3.19.0 released 2023-04-06 + +* This release contains security hardening measures based on recommendations + by a security audit sponsored by OSTIF and conducted by X41 D-Sec GmbH. + Several of these measures include changing defaults to be more strict, + by default simplejson will now only consume and produce compliant JSON, + but the flags still exist for any backwards compatibility needs. + No high priority issues were discovered, the reference count + leak is thought to be unreachable since the digits of the float are + checked before PyOS_string_to_double is called. + A link to the public version of this report will be included in a + future release of simplejson. The following fixes were implemented in + one PR: https://github.com/simplejson/simplejson/pull/313 +* Fix invalid handling of unicode escape sequences in the pure Python + implementation of the decoder (SJ-PT-23-01) +* Fix missing reference count decrease if PyOS_string_to_double raises + an exception in Python 2.x; was probably unreachable (SJ-PT-23-02) +* Backport the integer string length limitation from Python 3.11 to + limit quadratic number parsing (SJ-PT-23-03) +* Fix inconsistencies with error messages between the C and Python + implementations (SJ-PT-23-100) +* Remove unused unichr import from encoder (SJ-PT-23-101) +* Remove unused namedtuple_as_object and tuple_as_array arguments from + simplejson.load (SJ-PT-23-102) +* Remove vestigial _one_shot code from iterencode (SJ-PT-23-103) +* Change default of allow_nan from True to False and add allow_nan + to decoder (SJ-PT-23-107) + Version 3.18.4 released 2023-03-14 * Test the sdist to prevent future regressions diff --git a/conf.py b/conf.py index 921bbef..5a2dded 100644 --- a/conf.py +++ b/conf.py @@ -42,9 +42,9 @@ # other places throughout the built documents. # # The short X.Y version. -version = '3.18' +version = '3.19' # The full version, including alpha/beta/rc tags. -release = '3.18.4' +release = '3.19.0' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/index.rst b/index.rst index fbb52b5..f57d650 100644 --- a/index.rst +++ b/index.rst @@ -160,7 +160,7 @@ Basic Usage ----------- .. function:: dump(obj, fp, skipkeys=False, ensure_ascii=True, \ - check_circular=True, allow_nan=True, cls=None, \ + check_circular=True, allow_nan=False, cls=None, \ indent=None, separators=None, encoding='utf-8', \ default=None, use_decimal=True, \ namedtuple_as_object=True, tuple_as_array=True, \ @@ -191,7 +191,7 @@ Basic Usage is highly optimized. .. function:: dumps(obj, skipkeys=False, ensure_ascii=True, \ - check_circular=True, allow_nan=True, cls=None, \ + check_circular=True, allow_nan=False, cls=None, \ indent=None, separators=None, encoding='utf-8', \ default=None, use_decimal=True, \ namedtuple_as_object=True, tuple_as_array=True, \ @@ -225,13 +225,17 @@ Basic Usage reference check for container types will be skipped and a circular reference will result in an :exc:`OverflowError` (or worse). - If *allow_nan* is false (default: ``True``), then it will be a + If *allow_nan* is false (default: ``False``), then it will be a :exc:`ValueError` to serialize out of range :class:`float` values (``nan``, ``inf``, ``-inf``) in strict compliance of the original JSON specification. If *allow_nan* is true, their JavaScript equivalents will be used (``NaN``, ``Infinity``, ``-Infinity``). See also *ignore_nan* for ECMA-262 compliant behavior. + .. versionchanged:: 3.19.0 + The default for *allow_nan* was changed to False for better spec + compliance. + If *indent* is a string, then JSON array elements and object members will be pretty-printed with a newline followed by that string repeated for each level of nesting. ``None`` (the default) selects the most compact @@ -324,7 +328,7 @@ Basic Usage .. function:: load(fp, encoding='utf-8', cls=None, object_hook=None, \ parse_float=None, parse_int=None, \ parse_constant=None, object_pairs_hook=None, \ - use_decimal=None, **kw) + use_decimal=None, allow_nan=False, **kw) Deserialize *fp* (a ``.read()``-supporting file-like object containing a JSON document) to a Python object using this @@ -367,7 +371,7 @@ Basic Usage .. function:: loads(s, encoding='utf-8', cls=None, object_hook=None, \ parse_float=None, parse_int=None, \ parse_constant=None, object_pairs_hook=None, \ - use_decimal=None, **kw) + use_decimal=None, allow_nan=False, **kw) Deserialize *s* (a :class:`str` or :class:`unicode` instance containing a JSON document) to a Python object. :exc:`JSONDecodeError` will be @@ -412,9 +416,12 @@ Basic Usage be used to use another datatype or parser for JSON integers (e.g. :class:`float`). - *parse_constant*, if specified, will be called with one of the following - strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This can be used to - raise an exception if invalid JSON numbers are encountered. + .. versionchanged:: 3.19.0 + The integer to string conversion length limitation introduced in + Python 3.11 has been backported. An attempt to parse an integer + with more than 4300 digits will result in an exception unless a + suitable alternative parser is specified + (e.g. :class:`decimal.Decimal`) If *use_decimal* is true (default: ``False``) then *parse_float* is set to :class:`decimal.Decimal`. This is a convenience for parity with the @@ -436,12 +443,28 @@ Basic Usage Subclassing is not recommended. You should use *object_hook* or *object_pairs_hook*. This is faster and more portable than subclassing. + + *allow_nan*, if True (default false), will allow the parser to + accept the non-standard floats + ``NaN``, ``Infinity``, and ``-Infinity``. + + .. versionchanged:: 3.19.0 + + This argument was added to make it possible to use the legacy behavior + now that the parser is more strict about compliance to the standard. + + *parse_constant*, if specified, will be + called with one of the following strings: ``'-Infinity'``, + ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature, + as it is rare to parse non-compliant JSON containing these values. + + Encoders and decoders --------------------- .. class:: JSONDecoder(encoding='utf-8', object_hook=None, parse_float=None, \ parse_int=None, parse_constant=None, \ - object_pairs_hook=None, strict=True) + object_pairs_hook=None, strict=True, allow_nan=False) Simple JSON decoder. @@ -469,7 +492,8 @@ Encoders and decoders | null | None | None | +---------------+-----------+-----------+ - It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as their + When *allow_nan* is True, it also understands + ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding ``float`` values, which is outside the JSON spec. *encoding* determines the encoding used to interpret any :class:`str` objects @@ -502,15 +526,31 @@ Encoders and decoders be used to use another datatype or parser for JSON integers (e.g. :class:`float`). - *parse_constant*, if specified, will be called with one of the following - strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This can be used to - raise an exception if invalid JSON numbers are encountered. + .. versionchanged:: 3.19.0 + The integer to string conversion length limitation introduced in + Python 3.11 has been backported. An attempt to parse an integer + with more than 4300 digits will result in an exception unless a + suitable alternative parser is specified + (e.g. :class:`decimal.Decimal`) + + *parse_constant*, if specified, will be + called with one of the following strings: ``'-Infinity'``, + ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature, + as it is rare to parse non-compliant JSON containing these values. *strict* controls the parser's behavior when it encounters an invalid control character in a string. The default setting of ``True`` means that unescaped control characters are parse errors, if ``False`` then control characters will be allowed in strings. + *allow_nan* when True (not the default), the decoder will allow + ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding floats. + + .. versionchanged:: 3.19.0 + This argument was added to make it behave closer to the spec by + default. The previous behavior can be restored by setting this to + False. + .. method:: decode(s) Return the Python representation of the JSON document *s*. See @@ -532,7 +572,7 @@ Encoders and decoders document is not valid. .. class:: JSONEncoder(skipkeys=False, ensure_ascii=True, \ - check_circular=True, allow_nan=True, sort_keys=False, \ + check_circular=True, allow_nan=False, sort_keys=False, \ indent=None, separators=None, encoding='utf-8', \ default=None, use_decimal=True, \ namedtuple_as_object=True, tuple_as_array=True, \ @@ -573,7 +613,8 @@ Encoders and decoders wrapped in another type with an appropriate `for_json` method to transform the keys during encoding. - It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as their + When *allow_nan* is True, it also understands + ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding ``float`` values, which is outside the JSON spec. To extend this to recognize other objects, subclass and implement a @@ -599,12 +640,16 @@ Encoders and decoders prevent an infinite recursion (which would cause an :exc:`OverflowError`). Otherwise, no such check takes place. - If *allow_nan* is true (the default), then ``NaN``, ``Infinity``, and + If *allow_nan* is true (not the default), then ``NaN``, ``Infinity``, and ``-Infinity`` will be encoded as such. This behavior is not JSON - specification compliant, but is consistent with most JavaScript based - encoders and decoders. Otherwise, it will be a :exc:`ValueError` to encode + specification compliant. Otherwise, it will be a :exc:`ValueError` to encode such floats. See also *ignore_nan* for ECMA-262 compliant behavior. + .. versionchanged:: 3.19.0 + This default is now False to make it behave closer to the spec. + The previous behavior can be restored by setting this to + False. + If *sort_keys* is true (not the default), then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure that JSON serializations can be compared on a day-to-day basis. @@ -716,7 +761,7 @@ Encoders and decoders :meth:`iterencode`. .. class:: JSONEncoderForHTML(skipkeys=False, ensure_ascii=True, \ - check_circular=True, allow_nan=True, \ + check_circular=True, allow_nan=False, \ sort_keys=False, indent=None, separators=None, \ encoding='utf-8', \ default=None, use_decimal=True, \ @@ -826,22 +871,28 @@ Infinite and NaN Number Values The RFC does not permit the representation of infinite or NaN number values. Despite that, by default, this module accepts and outputs ``Infinity``, -``-Infinity``, and ``NaN`` as if they were valid JSON number literal values:: +``-Infinity``, and ``NaN`` as if they were valid JSON number literal values +if the allow_nan flag is enabled:: >>> # Neither of these calls raises an exception, but the results are not valid JSON - >>> json.dumps(float('-inf')) + >>> json.dumps(float('-inf'), allow_nan=True) '-Infinity' - >>> json.dumps(float('nan')) + >>> json.dumps(float('nan'), allow_nan=True) 'NaN' >>> # Same when deserializing - >>> json.loads('-Infinity') + >>> json.loads('-Infinity', allow_nan=True) -inf - >>> json.loads('NaN') + >>> json.loads('NaN', allow_nan=True) nan + >>> # ignore_nan uses the ECMA-262 behavior to serialize these as null + >>> json.dumps(float('-inf'), ignore_nan=True) + 'null' + >>> json.dumps(float('nan'), ignore_nan=True) + 'null' In the serializer, the *allow_nan* parameter can be used to alter this -behavior. In the deserializer, the *parse_constant* parameter can be used to -alter this behavior. +behavior. In the deserializer, the *allow_nan* and +*parse_constant* parameters can be used to alter this behavior. Repeated Names Within an Object diff --git a/simplejson/__init__.py b/simplejson/__init__.py index 47e49a3..206e22d 100644 --- a/simplejson/__init__.py +++ b/simplejson/__init__.py @@ -118,7 +118,7 @@ """ from __future__ import absolute_import -__version__ = '3.18.4' +__version__ = '3.19.0' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONDecodeError', 'JSONEncoder', @@ -149,28 +149,10 @@ def _import_c_make_encoder(): except ImportError: return None -_default_encoder = JSONEncoder( - skipkeys=False, - ensure_ascii=True, - check_circular=True, - allow_nan=True, - indent=None, - separators=None, - encoding='utf-8', - default=None, - use_decimal=True, - namedtuple_as_object=True, - tuple_as_array=True, - iterable_as_array=False, - bigint_as_string=False, - item_sort_key=None, - for_json=False, - ignore_nan=False, - int_as_string_bitcount=None, -) +_default_encoder = JSONEncoder() def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, - allow_nan=True, cls=None, indent=None, separators=None, + allow_nan=False, cls=None, indent=None, separators=None, encoding='utf-8', default=None, use_decimal=True, namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False, sort_keys=False, item_sort_key=None, @@ -187,10 +169,10 @@ def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, contain non-ASCII characters, so long as they do not need to be escaped by JSON. When it is true, all non-ASCII characters are escaped. - If *allow_nan* is false, then it will be a ``ValueError`` to - serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) - in strict compliance of the original JSON specification, instead of using - the JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). See + If *allow_nan* is true (default: ``False``), then out of range ``float`` + values (``nan``, ``inf``, ``-inf``) will be serialized to + their JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``) + instead of raising a ValueError. See *ignore_nan* for ECMA-262 compliant behavior. If *indent* is a string, then JSON array elements and object members @@ -258,7 +240,7 @@ def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, """ # cached encoder if (not skipkeys and ensure_ascii and - check_circular and allow_nan and + check_circular and not allow_nan and cls is None and indent is None and separators is None and encoding == 'utf-8' and default is None and use_decimal and namedtuple_as_object and tuple_as_array and not iterable_as_array @@ -292,7 +274,7 @@ def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, - allow_nan=True, cls=None, indent=None, separators=None, + allow_nan=False, cls=None, indent=None, separators=None, encoding='utf-8', default=None, use_decimal=True, namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False, sort_keys=False, item_sort_key=None, @@ -312,10 +294,11 @@ def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, for container types will be skipped and a circular reference will result in an ``OverflowError`` (or worse). - If ``allow_nan`` is false, then it will be a ``ValueError`` to - serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in - strict compliance of the JSON specification, instead of using the - JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). + If *allow_nan* is true (default: ``False``), then out of range ``float`` + values (``nan``, ``inf``, ``-inf``) will be serialized to + their JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``) + instead of raising a ValueError. See + *ignore_nan* for ECMA-262 compliant behavior. If ``indent`` is a string, then JSON array elements and object members will be pretty-printed with a newline followed by that string repeated @@ -383,7 +366,7 @@ def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, """ # cached encoder if (not skipkeys and ensure_ascii and - check_circular and allow_nan and + check_circular and not allow_nan and cls is None and indent is None and separators is None and encoding == 'utf-8' and default is None and use_decimal and namedtuple_as_object and tuple_as_array and not iterable_as_array @@ -412,14 +395,12 @@ def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, **kw).encode(obj) -_default_decoder = JSONDecoder(encoding=None, object_hook=None, - object_pairs_hook=None) +_default_decoder = JSONDecoder() def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, - use_decimal=False, namedtuple_as_object=True, tuple_as_array=True, - **kw): + use_decimal=False, allow_nan=False, **kw): """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing a JSON document as `str` or `bytes`) to a Python object. @@ -442,23 +423,27 @@ def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, takes priority. *parse_float*, if specified, will be called with the string of every - JSON float to be decoded. By default, this is equivalent to + JSON float to be decoded. By default, this is equivalent to ``float(num_str)``. This can be used to use another datatype or parser for JSON floats (e.g. :class:`decimal.Decimal`). *parse_int*, if specified, will be called with the string of every - JSON int to be decoded. By default, this is equivalent to + JSON int to be decoded. By default, this is equivalent to ``int(num_str)``. This can be used to use another datatype or parser for JSON integers (e.g. :class:`float`). - *parse_constant*, if specified, will be called with one of the - following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This - can be used to raise an exception if invalid JSON numbers are - encountered. + *allow_nan*, if True (default false), will allow the parser to + accept the non-standard floats ``NaN``, ``Infinity``, and ``-Infinity`` + and enable the use of the deprecated *parse_constant*. If *use_decimal* is true (default: ``False``) then it implies parse_float=decimal.Decimal for parity with ``dump``. + *parse_constant*, if specified, will be + called with one of the following strings: ``'-Infinity'``, + ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature, + as it is rare to parse non-compliant JSON containing these values. + To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` kwarg. NOTE: You should use *object_hook* or *object_pairs_hook* instead of subclassing whenever possible. @@ -468,12 +453,12 @@ def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, encoding=encoding, cls=cls, object_hook=object_hook, parse_float=parse_float, parse_int=parse_int, parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, - use_decimal=use_decimal, **kw) + use_decimal=use_decimal, allow_nan=allow_nan, **kw) def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, - use_decimal=False, **kw): + use_decimal=False, allow_nan=False, **kw): """Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON document) to a Python object. @@ -505,14 +490,18 @@ def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, ``int(num_str)``. This can be used to use another datatype or parser for JSON integers (e.g. :class:`float`). - *parse_constant*, if specified, will be called with one of the - following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This - can be used to raise an exception if invalid JSON numbers are - encountered. + *allow_nan*, if True (default false), will allow the parser to + accept the non-standard floats ``NaN``, ``Infinity``, and ``-Infinity`` + and enable the use of the deprecated *parse_constant*. If *use_decimal* is true (default: ``False``) then it implies parse_float=decimal.Decimal for parity with ``dump``. + *parse_constant*, if specified, will be + called with one of the following strings: ``'-Infinity'``, + ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature, + as it is rare to parse non-compliant JSON containing these values. + To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` kwarg. NOTE: You should use *object_hook* or *object_pairs_hook* instead of subclassing whenever possible. @@ -521,7 +510,7 @@ def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, if (cls is None and encoding is None and object_hook is None and parse_int is None and parse_float is None and parse_constant is None and object_pairs_hook is None - and not use_decimal and not kw): + and not use_decimal and not allow_nan and not kw): return _default_decoder.decode(s) if cls is None: cls = JSONDecoder @@ -539,6 +528,8 @@ def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, if parse_float is not None: raise TypeError("use_decimal=True implies parse_float=Decimal") kw['parse_float'] = Decimal + if allow_nan: + kw['allow_nan'] = True return cls(encoding=encoding, **kw).decode(s) @@ -560,22 +551,9 @@ def _toggle_speedups(enabled): scan.make_scanner = scan.py_make_scanner dec.make_scanner = scan.make_scanner global _default_decoder - _default_decoder = JSONDecoder( - encoding=None, - object_hook=None, - object_pairs_hook=None, - ) + _default_decoder = JSONDecoder() global _default_encoder - _default_encoder = JSONEncoder( - skipkeys=False, - ensure_ascii=True, - check_circular=True, - allow_nan=True, - indent=None, - separators=None, - encoding='utf-8', - default=None, - ) + _default_encoder = JSONEncoder() def simple_first(kv): """Helper function to pass to item_sort_key to sort simple diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index ec054c7..bd56b4d 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -1843,7 +1843,7 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi } static PyObject * -_parse_constant(PyScannerObject *s, PyObject *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) +_parse_constant(PyScannerObject *s, PyObject *pystr, PyObject *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { /* Read a JSON constant from PyString pystr. constant is the Python string that was found @@ -1855,6 +1855,10 @@ _parse_constant(PyScannerObject *s, PyObject *constant, Py_ssize_t idx, Py_ssize Returns the result of parse_constant */ PyObject *rval; + if (s->parse_constant == Py_None) { + raise_errmsg(ERR_EXPECTING_VALUE, pystr, idx); + return NULL; + } /* rval = parse_constant(constant) */ rval = PyObject_CallOneArg(s->parse_constant, constant); @@ -1886,7 +1890,7 @@ _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssiz /* read a sign if it's there, make sure it's not the end of the string */ if (str[idx] == '-') { if (idx >= end_idx) { - raise_errmsg(ERR_EXPECTING_VALUE, pystr, idx); + raise_errmsg(ERR_EXPECTING_VALUE, pystr, start); return NULL; } idx++; @@ -1903,7 +1907,7 @@ _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssiz } /* no integer digits, error */ else { - raise_errmsg(ERR_EXPECTING_VALUE, pystr, idx); + raise_errmsg(ERR_EXPECTING_VALUE, pystr, start); return NULL; } @@ -1949,8 +1953,10 @@ _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssiz /* rval = PyFloat_FromDouble(PyOS_ascii_atof(PyString_AS_STRING(numstr))); */ double d = PyOS_string_to_double(PyString_AS_STRING(numstr), NULL, NULL); - if (d == -1.0 && PyErr_Occurred()) + if (d == -1.0 && PyErr_Occurred()) { + Py_DECREF(numstr); return NULL; + } rval = PyFloat_FromDouble(d); } } @@ -1993,7 +1999,7 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ /* read a sign if it's there, make sure it's not the end of the string */ if (PyUnicode_READ(kind, str, idx) == '-') { if (idx >= end_idx) { - raise_errmsg(ERR_EXPECTING_VALUE, pystr, idx); + raise_errmsg(ERR_EXPECTING_VALUE, pystr, start); return NULL; } idx++; @@ -2013,7 +2019,7 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ } else { /* no integer digits, error */ - raise_errmsg(ERR_EXPECTING_VALUE, pystr, idx); + raise_errmsg(ERR_EXPECTING_VALUE, pystr, start); return NULL; } @@ -2156,7 +2162,7 @@ scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *n case 'N': /* NaN */ if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { - rval = _parse_constant(s, JSON_NaN, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_NaN, idx, next_idx_ptr); } else fallthrough = 1; @@ -2164,7 +2170,7 @@ scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *n case 'I': /* Infinity */ if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { - rval = _parse_constant(s, JSON_Infinity, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_Infinity, idx, next_idx_ptr); } else fallthrough = 1; @@ -2172,7 +2178,7 @@ scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *n case '-': /* -Infinity */ if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { - rval = _parse_constant(s, JSON_NegInfinity, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_NegInfinity, idx, next_idx_ptr); } else fallthrough = 1; @@ -2275,7 +2281,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ if ((idx + 2 < length) && PyUnicode_READ(kind, str, idx + 1) == 'a' && PyUnicode_READ(kind, str, idx + 2) == 'N') { - rval = _parse_constant(s, JSON_NaN, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_NaN, idx, next_idx_ptr); } else fallthrough = 1; @@ -2290,7 +2296,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ PyUnicode_READ(kind, str, idx + 5) == 'i' && PyUnicode_READ(kind, str, idx + 6) == 't' && PyUnicode_READ(kind, str, idx + 7) == 'y') { - rval = _parse_constant(s, JSON_Infinity, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_Infinity, idx, next_idx_ptr); } else fallthrough = 1; @@ -2306,7 +2312,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ PyUnicode_READ(kind, str, idx + 6) == 'i' && PyUnicode_READ(kind, str, idx + 7) == 't' && PyUnicode_READ(kind, str, idx + 8) == 'y') { - rval = _parse_constant(s, JSON_NegInfinity, idx, next_idx_ptr); + rval = _parse_constant(s, pystr, JSON_NegInfinity, idx, next_idx_ptr); } else fallthrough = 1; diff --git a/simplejson/decoder.py b/simplejson/decoder.py index 1a8f772..c99a976 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -46,9 +46,35 @@ def _floatconstants(): DEFAULT_ENCODING = "utf-8" +if hasattr(sys, 'get_int_max_str_digits'): + bounded_int = int +else: + def bounded_int(s, INT_MAX_STR_DIGITS=4300): + """Backport of the integer string length conversion limitation + + https://docs.python.org/3/library/stdtypes.html#int-max-str-digits + """ + if len(s) > INT_MAX_STR_DIGITS: + raise ValueError("Exceeds the limit (%s) for integer string conversion: value has %s digits" % (INT_MAX_STR_DIGITS, len(s))) + return int(s) + + +def scan_four_digit_hex(s, end, _m=re.compile(r'^[0-9a-fA-F]{4}$').match): + """Scan a four digit hex number from s[end:end + 4] + """ + msg = "Invalid \\uXXXX escape sequence" + esc = s[end:end + 4] + if not _m(esc): + raise JSONDecodeError(msg, s, end - 2) + try: + return int(esc, 16), end + 4 + except ValueError: + raise JSONDecodeError(msg, s, end - 2) + def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match, _join=u''.join, - _PY3=PY3, _maxunicode=sys.maxunicode): + _PY3=PY3, _maxunicode=sys.maxunicode, + _scan_four_digit_hex=scan_four_digit_hex): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError @@ -67,6 +93,7 @@ def py_scanstring(s, end, encoding=None, strict=True, if chunk is None: raise JSONDecodeError( "Unterminated string starting at", s, begin) + prev_end = end end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters @@ -81,7 +108,7 @@ def py_scanstring(s, end, encoding=None, strict=True, elif terminator != '\\': if strict: msg = "Invalid control character %r at" - raise JSONDecodeError(msg, s, end) + raise JSONDecodeError(msg, s, prev_end) else: _append(terminator) continue @@ -100,35 +127,18 @@ def py_scanstring(s, end, encoding=None, strict=True, end += 1 else: # Unicode escape sequence - msg = "Invalid \\uXXXX escape sequence" - esc = s[end + 1:end + 5] - escX = esc[1:2] - if len(esc) != 4 or escX == 'x' or escX == 'X': - raise JSONDecodeError(msg, s, end - 1) - try: - uni = int(esc, 16) - except ValueError: - raise JSONDecodeError(msg, s, end - 1) - if uni < 0 or uni > _maxunicode: - raise JSONDecodeError(msg, s, end - 1) - end += 5 + uni, end = _scan_four_digit_hex(s, end + 1) # Check for surrogate pair on UCS-4 systems # Note that this will join high/low surrogate pairs # but will also pass unpaired surrogates through if (_maxunicode > 65535 and uni & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u'): - esc2 = s[end + 2:end + 6] - escX = esc2[1:2] - if len(esc2) == 4 and not (escX == 'x' or escX == 'X'): - try: - uni2 = int(esc2, 16) - except ValueError: - raise JSONDecodeError(msg, s, end) - if uni2 & 0xfc00 == 0xdc00: - uni = 0x10000 + (((uni - 0xd800) << 10) | - (uni2 - 0xdc00)) - end += 6 + uni2, end2 = _scan_four_digit_hex(s, end + 2) + if uni2 & 0xfc00 == 0xdc00: + uni = 0x10000 + (((uni - 0xd800) << 10) | + (uni2 - 0xdc00)) + end = end2 char = unichr(uni) # Append the unescaped character _append(char) @@ -169,7 +179,7 @@ def JSONObject(state, encoding, strict, scan_once, object_hook, return pairs, end + 1 elif nextchar != '"': raise JSONDecodeError( - "Expecting property name enclosed in double quotes", + "Expecting property name enclosed in double quotes or '}'", s, end) end += 1 while True: @@ -296,14 +306,15 @@ class JSONDecoder(object): | null | None | +---------------+-------------------+ - It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as + When allow_nan=True, it also understands + ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding ``float`` values, which is outside the JSON spec. """ def __init__(self, encoding=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, - object_pairs_hook=None): + object_pairs_hook=None, allow_nan=False): """ *encoding* determines the encoding used to interpret any :class:`str` objects decoded by this instance (``'utf-8'`` by @@ -336,10 +347,13 @@ def __init__(self, encoding=None, object_hook=None, parse_float=None, ``int(num_str)``. This can be used to use another datatype or parser for JSON integers (e.g. :class:`float`). - *parse_constant*, if specified, will be called with one of the - following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This - can be used to raise an exception if invalid JSON numbers are - encountered. + *allow_nan*, if True (default false), will allow the parser to + accept the non-standard floats ``NaN``, ``Infinity``, and ``-Infinity``. + + *parse_constant*, if specified, will be + called with one of the following strings: ``'-Infinity'``, + ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature, + as it is rare to parse non-compliant JSON containing these values. *strict* controls the parser's behavior when it encounters an invalid control character in a string. The default setting of @@ -353,8 +367,8 @@ def __init__(self, encoding=None, object_hook=None, parse_float=None, self.object_hook = object_hook self.object_pairs_hook = object_pairs_hook self.parse_float = parse_float or float - self.parse_int = parse_int or int - self.parse_constant = parse_constant or _CONSTANTS.__getitem__ + self.parse_int = parse_int or bounded_int + self.parse_constant = parse_constant or (allow_nan and _CONSTANTS.__getitem__ or None) self.strict = strict self.parse_object = JSONObject self.parse_array = JSONArray diff --git a/simplejson/encoder.py b/simplejson/encoder.py index e93fe43..661ff36 100644 --- a/simplejson/encoder.py +++ b/simplejson/encoder.py @@ -5,7 +5,7 @@ from operator import itemgetter # Do not import Decimal directly to avoid reload issues import decimal -from .compat import unichr, binary_type, text_type, string_types, integer_types, PY3 +from .compat import binary_type, text_type, string_types, integer_types, PY3 def _import_speedups(): try: from . import _speedups @@ -140,7 +140,7 @@ class JSONEncoder(object): key_separator = ': ' def __init__(self, skipkeys=False, ensure_ascii=True, - check_circular=True, allow_nan=True, sort_keys=False, + check_circular=True, allow_nan=False, sort_keys=False, indent=None, separators=None, encoding='utf-8', default=None, use_decimal=True, namedtuple_as_object=True, tuple_as_array=True, bigint_as_string=False, @@ -161,10 +161,11 @@ def __init__(self, skipkeys=False, ensure_ascii=True, prevent an infinite recursion (which would cause an OverflowError). Otherwise, no such check takes place. - If allow_nan is true, then NaN, Infinity, and -Infinity will be - encoded as such. This behavior is not JSON specification compliant, - but is consistent with most JavaScript based encoders and decoders. - Otherwise, it will be a ValueError to encode such floats. + If allow_nan is true (default: False), then out of range float + values (nan, inf, -inf) will be serialized to + their JavaScript equivalents (NaN, Infinity, -Infinity) + instead of raising a ValueError. See + ignore_nan for ECMA-262 compliant behavior. If sort_keys is true, then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure @@ -294,7 +295,7 @@ def encode(self, o): # This doesn't pass the iterator directly to ''.join() because the # exceptions aren't as detailed. The list call should be roughly # equivalent to the PySequence_Fast that ''.join() would do. - chunks = self.iterencode(o, _one_shot=True) + chunks = self.iterencode(o) if not isinstance(chunks, (list, tuple)): chunks = list(chunks) if self.ensure_ascii: @@ -302,7 +303,7 @@ def encode(self, o): else: return u''.join(chunks) - def iterencode(self, o, _one_shot=False): + def iterencode(self, o): """Encode the given object and yield each string representation as available. @@ -356,8 +357,7 @@ def floatstr(o, allow_nan=self.allow_nan, ignore_nan=self.ignore_nan, key_memo = {} int_as_string_bitcount = ( 53 if self.bigint_as_string else self.int_as_string_bitcount) - if (_one_shot and c_make_encoder is not None - and self.indent is None): + if (c_make_encoder is not None and self.indent is None): _iterencode = c_make_encoder( markers, self.default, _encoder, self.indent, self.key_separator, self.item_separator, self.sort_keys, @@ -370,7 +370,7 @@ def floatstr(o, allow_nan=self.allow_nan, ignore_nan=self.ignore_nan, _iterencode = _make_iterencode( markers, self.default, _encoder, self.indent, floatstr, self.key_separator, self.item_separator, self.sort_keys, - self.skipkeys, _one_shot, self.use_decimal, + self.skipkeys, self.use_decimal, self.namedtuple_as_object, self.tuple_as_array, int_as_string_bitcount, self.item_sort_key, self.encoding, self.for_json, @@ -398,14 +398,14 @@ class JSONEncoderForHTML(JSONEncoder): def encode(self, o): # Override JSONEncoder.encode because it has hacks for # performance that make things more complicated. - chunks = self.iterencode(o, True) + chunks = self.iterencode(o) if self.ensure_ascii: return ''.join(chunks) else: return u''.join(chunks) - def iterencode(self, o, _one_shot=False): - chunks = super(JSONEncoderForHTML, self).iterencode(o, _one_shot) + def iterencode(self, o): + chunks = super(JSONEncoderForHTML, self).iterencode(o) for chunk in chunks: chunk = chunk.replace('&', '\\u0026') chunk = chunk.replace('<', '\\u003c') @@ -419,7 +419,7 @@ def iterencode(self, o, _one_shot=False): def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, - _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, + _key_separator, _item_separator, _sort_keys, _skipkeys, _use_decimal, _namedtuple_as_object, _tuple_as_array, _int_as_string_bitcount, _item_sort_key, _encoding,_for_json, diff --git a/simplejson/scanner.py b/simplejson/scanner.py index 85e385e..34710d6 100644 --- a/simplejson/scanner.py +++ b/simplejson/scanner.py @@ -60,11 +60,11 @@ def _scan_once(string, idx): else: res = parse_int(integer) return res, m.end() - elif nextchar == 'N' and string[idx:idx + 3] == 'NaN': + elif parse_constant and nextchar == 'N' and string[idx:idx + 3] == 'NaN': return parse_constant('NaN'), idx + 3 - elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity': + elif parse_constant and nextchar == 'I' and string[idx:idx + 8] == 'Infinity': return parse_constant('Infinity'), idx + 8 - elif nextchar == '-' and string[idx:idx + 9] == '-Infinity': + elif parse_constant and nextchar == '-' and string[idx:idx + 9] == '-Infinity': return parse_constant('-Infinity'), idx + 9 else: raise JSONDecodeError(errmsg, string, idx) diff --git a/simplejson/tests/test_decode.py b/simplejson/tests/test_decode.py index 6960ee5..317b4f9 100644 --- a/simplejson/tests/test_decode.py +++ b/simplejson/tests/test_decode.py @@ -2,6 +2,7 @@ import decimal from unittest import TestCase +import sys import simplejson as json from simplejson.compat import StringIO, b, binary_type from simplejson import OrderedDict @@ -117,3 +118,10 @@ def test_bounds_checking(self): diff = id(x) - id(y) self.assertRaises(ValueError, j.scan_once, y, diff) self.assertRaises(ValueError, j.raw_decode, y, i) + + def test_bounded_int(self): + # SJ-PT-23-03, limit quadratic number parsing per Python 3.11 + max_str_digits = getattr(sys, 'get_int_max_str_digits', lambda: 4300)() + s = '1' + '0' * (max_str_digits - 1) + self.assertEqual(json.loads(s), int(s)) + self.assertRaises(ValueError, json.loads, s + '0') diff --git a/simplejson/tests/test_fail.py b/simplejson/tests/test_fail.py index 788f3a5..5f9a8f6 100644 --- a/simplejson/tests/test_fail.py +++ b/simplejson/tests/test_fail.py @@ -145,7 +145,7 @@ def test_truncated_input(self): ('["spam', 'Unterminated string starting at', 1), ('["spam"', "Expecting ',' delimiter", 7), ('["spam",', 'Expecting value', 8), - ('{', 'Expecting property name enclosed in double quotes', 1), + ('{', "Expecting property name enclosed in double quotes or '}'", 1), ('{"', 'Unterminated string starting at', 1), ('{"spam', 'Unterminated string starting at', 1), ('{"spam"', "Expecting ':' delimiter", 7), @@ -156,6 +156,8 @@ def test_truncated_input(self): ('"', 'Unterminated string starting at', 0), ('"spam', 'Unterminated string starting at', 0), ('[,', "Expecting value", 1), + ('--', 'Expecting value', 0), + ('"\x18d', "Invalid control character %r", 1), ] for data, msg, idx in test_cases: try: diff --git a/simplejson/tests/test_float.py b/simplejson/tests/test_float.py index e382ec2..a977969 100644 --- a/simplejson/tests/test_float.py +++ b/simplejson/tests/test_float.py @@ -7,9 +7,9 @@ class TestFloat(TestCase): def test_degenerates_allow(self): for inf in (PosInf, NegInf): - self.assertEqual(json.loads(json.dumps(inf)), inf) + self.assertEqual(json.loads(json.dumps(inf, allow_nan=True), allow_nan=True), inf) # Python 2.5 doesn't have math.isnan - nan = json.loads(json.dumps(NaN)) + nan = json.loads(json.dumps(NaN, allow_nan=True), allow_nan=True) self.assertTrue((0 + nan) != nan) def test_degenerates_ignore(self): @@ -19,6 +19,9 @@ def test_degenerates_ignore(self): def test_degenerates_deny(self): for f in (PosInf, NegInf, NaN): self.assertRaises(ValueError, json.dumps, f, allow_nan=False) + for s in ('Infinity', '-Infinity', 'NaN'): + self.assertRaises(ValueError, json.loads, s, allow_nan=False) + self.assertRaises(ValueError, json.loads, s) def test_floats(self): for num in [1617161771.7650001, math.pi, math.pi**100, diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py index c6c53b8..1f54483 100644 --- a/simplejson/tests/test_scanstring.py +++ b/simplejson/tests/test_scanstring.py @@ -132,7 +132,9 @@ def _test_scanstring(self, scanstring): self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True) - self.assertRaises(json.JSONDecodeError, scanstring, "\\u-123", 0, None, True) + self.assertRaises(json.JSONDecodeError, scanstring, '\\u-123"', 0, None, True) + # SJ-PT-23-01: Invalid Handling of Broken Unicode Escape Sequences + self.assertRaises(json.JSONDecodeError, scanstring, '\\u EDD"', 0, None, True) def test_issue3623(self): self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,