[3.6] bpo-32583: Fix possible crashing in builtin Unicode decoders (GH-5325) (#5459)

When using customized decode error handlers, it is possible for builtin decoders
to write out-of-bounds and then crash.
(cherry picked from commit 2c7fd46)
zhangyangyu committed Jan 31, 2018
1 parent eb126ed commit ea94fce
Showing 3 changed files with 74 additions and 2 deletions.
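
For context before the diffs: a decode error handler registered with codecs.register_error receives a UnicodeDecodeError and returns a (replacement, new_position) tuple. The crash could occur when the handler resumes decoding before exc.end, so the decoder ends up producing more characters than it originally budgeted for. The sketch below is a minimal illustration of that pattern; the handler and error names are made up for this example, and the input bytes mirror the UTF-16-LE test case added in this commit.

import codecs

def resume_early(exc):
    # Replace one bad unit but resume only one byte past exc.start,
    # i.e. before exc.end, so already-reported bytes get decoded again.
    if isinstance(exc, UnicodeDecodeError):
        return ('\ufffd', exc.start + 1)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("example.resume_early", resume_early)

# The leading 0xd8 bytes produce unpaired-surrogate errors; with the handler
# above the 8-byte input decodes to 6 characters, more than the decoder's
# initial one-character-per-two-bytes estimate.
print(b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
    'utf-16-le', 'example.resume_early'))  # '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
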
52 changes: 52 additions & 0 deletions Lib/test/test_codeccallbacks.py
@@ -1044,6 +1044,58 @@ def mutating(exc):
        for (encoding, data) in baddata:
            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")

    # issue32583
    def test_crashing_decode_handler(self):
        # better generating one more character to fill the extra space slot
        # so in debug build it can steadily fail
        def forward_shorter_than_end(exc):
            if isinstance(exc, UnicodeDecodeError):
                # size one character, 0 < forward < exc.end
                return ('\ufffd', exc.start+1)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error(
            "test.forward_shorter_than_end", forward_shorter_than_end)

        self.assertEqual(
            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
                'utf-16-le', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
        )
        self.assertEqual(
            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
                'utf-16-be', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
        )
        self.assertEqual(
            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
                'utf-32-le', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\u1111\x00'
        )
        self.assertEqual(
            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
                'utf-32-be', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\u1111\x00'
        )

        def replace_with_long(exc):
            if isinstance(exc, UnicodeDecodeError):
                exc.object = b"\x00" * 8
                return ('\ufffd', exc.start)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replace_with_long", replace_with_long)

        self.assertEqual(
            b'\x00'.decode('utf-16', 'test.replace_with_long'),
            '\ufffd\x00\x00\x00\x00'
        )
        self.assertEqual(
            b'\x00'.decode('utf-32', 'test.replace_with_long'),
            '\ufffd\x00\x00'
        )


    def test_fake_error_class(self):
        handlers = [
            codecs.strict_errors,
@@ -0,0 +1,2 @@
Fix a possible crash in builtin Unicode decoders caused by out-of-bounds writes
when using customized decode error handlers.
22 changes: 20 additions & 2 deletions Objects/unicodeobject.c
@@ -4429,7 +4429,10 @@ unicode_decode_call_errorhandler_writer(
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
@@ -4463,6 +4466,7 @@ unicode_decode_call_errorhandler_writer(
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
    }
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
@@ -4482,6 +4486,19 @@
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then. Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                     PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4491,7 +4508,7 @@
        goto onError;

    *endinpos = newpos;
    *inptr = *input + newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_XDECREF(restuple);
@@ -5663,7 +5680,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
       character count normally. Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
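
Rough arithmetic behind the new growth logic, using the UTF-16-LE case from the test above; this sketch assumes the test.forward_shorter_than_end handler added in Lib/test/test_codeccallbacks.py has already been registered.

# PyUnicode_DecodeUTF16Stateful's initial estimate: one character per
# two input bytes, i.e. (e - q + 1) / 2.
data = b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'        # 8 bytes
initial_min_length = (len(data) + 1) // 2         # 4 characters

# A handler that resumes before exc.end makes the decoder revisit bytes
# it had already budgeted for, so the output exceeds that estimate and
# the writer must grow instead of writing past its buffer.
result = data.decode('utf-16-le', 'test.forward_shorter_than_end')
assert result == '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
assert len(result) > initial_min_length           # 6 > 4
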
