Skip to content

Commit

Permalink
bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicod…
Browse files Browse the repository at this point in the history
…e-escape" codec (GH-28944) (GH-28953)

They now support splitting escape sequences between input chunks.

Add a third parameter, "final", to codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.

(cherry picked from commit 39aa983)
  • Loading branch information
serhiy-storchaka committed Oct 14, 2021
1 parent 7c722e3 commit 6848602
Show file tree
Hide file tree
Showing 8 changed files with 443 additions and 351 deletions.
640 changes: 324 additions & 316 deletions Doc/data/python3.9.abi

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Include/cpython/unicodeobject.h
Expand Up @@ -888,6 +888,14 @@ Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Py_ssize_t length /* Number of Py_UNICODE chars to encode */
);

/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.

   PyUnicode_DecodeRawUnicodeEscape calls this function with consumed == NULL.
   When consumed is non-NULL, an escape sequence that is cut off by the end
   of the input is not reported as an error: decoding stops in front of it
   and *consumed receives the offset of the backslash that starts it.
   For empty input *consumed is set to 0.
   NOTE(review): the caller in Modules/_codecsmodule.c pre-initialises the
   value to the full input length before the call; confirm whether *consumed
   is also written when the whole input is decoded. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
const char *string, /* Raw-Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed; may be NULL */
);

/* --- Latin-1 Codecs ----------------------------------------------------- */

PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
Expand Down
9 changes: 5 additions & 4 deletions Lib/encodings/raw_unicode_escape.py
Expand Up @@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.raw_unicode_escape_encode(input, self.errors)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
return codecs.raw_unicode_escape_decode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
return codecs.raw_unicode_escape_decode(input, errors, final)

class StreamWriter(Codec,codecs.StreamWriter):
pass

class StreamReader(Codec,codecs.StreamReader):
pass
def decode(self, input, errors='strict'):
return codecs.raw_unicode_escape_decode(input, errors, False)

### encodings module API

Expand Down
35 changes: 34 additions & 1 deletion Lib/test/test_codecs.py
Expand Up @@ -2457,7 +2457,11 @@ def test_partial(self):
]
)

class RawUnicodeEscapeTest(unittest.TestCase):
class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
encoding = "raw-unicode-escape"

test_lone_surrogates = None

def test_empty(self):
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
Expand Down Expand Up @@ -2506,6 +2510,35 @@ def test_decode_errors(self):
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

def test_partial(self):
# The input string encodes to 22 bytes in "raw-unicode-escape":
# 6 single bytes (NUL, tab, LF, CR, backslash, 0xff), then the 6-byte
# \uffff escape and the 10-byte \U00010000 escape.  The expected list
# below has one entry per encoded byte.
# NOTE(review): check_partial is inherited from ReadTest (not shown here);
# presumably it feeds the encoded input one byte at a time and compares
# the accumulated output with each entry -- confirm against ReadTest.
# The output is unchanged after the lone backslash byte and while a
# \u / \U escape is still incomplete.
self.check_partial(
"\x00\t\n\r\\\xff\uffff\U00010000",
[
'\x00',
'\x00\t',
'\x00\t\n',
'\x00\t\n\r',
'\x00\t\n\r',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff\U00010000',
]
)


class EscapeEncodeTest(unittest.TestCase):

Expand Down
@@ -0,0 +1,2 @@
Fix the incremental decoder and stream reader in the "raw-unicode-escape" codec.
Previously they failed if an escape sequence was split between input chunks.
13 changes: 8 additions & 5 deletions Modules/_codecsmodule.c
Expand Up @@ -507,17 +507,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
_codecs.raw_unicode_escape_decode
data: Py_buffer(accept={str, buffer})
errors: str(accept={str, NoneType}) = None
final: bool(accept={int}) = True
/
[clinic start generated code]*/

static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
const char *errors)
/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
const char *errors, int final)
/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
{
PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
errors);
return codec_tuple(decoded, data->len);
Py_ssize_t consumed = data->len;
PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
errors,
final ? NULL : &consumed);
return codec_tuple(decoded, consumed);
}

/*[clinic input]
Expand Down
23 changes: 18 additions & 5 deletions Modules/clinic/_codecsmodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 44 additions & 20 deletions Objects/unicodeobject.c
Expand Up @@ -6308,8 +6308,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;

#define WRITE_ASCII_CHAR(ch) \
Expand All @@ -6336,7 +6334,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue;
}

startinpos = s - starts - 1;
Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
Expand Down Expand Up @@ -6483,8 +6481,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos;
break;
}
error:
endinpos = s-starts;
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
Expand Down Expand Up @@ -6679,9 +6677,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
/* --- Raw Unicode Escape Codec ------------------------------------------- */

PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
_PyUnicodeWriter writer;
Expand All @@ -6690,6 +6689,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL;

if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}

Expand All @@ -6708,8 +6710,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;

#define WRITE_CHAR(ch) \
Expand All @@ -6724,11 +6724,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0)

/* Non-escape characters are interpreted as Unicode ordinals */
if (c != '\\' || s >= end) {
if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c);
continue;
}

Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
assert(consumed);
// Set message to silence a compiler warning.
// It is never actually used.
message = "\\ at end of string";
goto incomplete;
}

c = (unsigned char) *s++;
if (c == 'u') {
count = 4;
Expand All @@ -6744,10 +6754,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c);
continue;
}
startinpos = s - starts - 2;

/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
for (ch = 0; count && s < end; ++s, --count) {
for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
Expand All @@ -6760,18 +6772,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
break;
goto error;
}
}
if (!count) {
if (ch <= MAX_UNICODE) {
WRITE_CHAR(ch);
continue;
}
if (ch > MAX_UNICODE) {
message = "\\Uxxxxxxxx out of range";
goto error;
}
WRITE_CHAR(ch);
continue;

endinpos = s-starts;
incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
Expand All @@ -6793,7 +6810,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}

/* Decode a Raw-Unicode-Escape encoded buffer in a single call.

   Thin wrapper around _PyUnicode_DecodeRawUnicodeEscapeStateful() that
   passes no "consumed" output pointer, i.e. requests no partial decoding. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    Py_ssize_t *const no_consumed = NULL;

    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors,
                                                     no_consumed);
}


Expand Down

0 comments on commit 6848602

Please sign in to comment.