Skip to content

Commit

Permalink
Fix unicode_decode_utf8() perf regression
Browse files Browse the repository at this point in the history
  • Loading branch information
vstinner committed May 22, 2024
1 parent 33b1c4a commit 99f4b13
Showing 1 changed file with 75 additions and 33 deletions.
108 changes: 75 additions & 33 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -4750,35 +4750,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)


static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
const char *starts, const char *s, const char *end,
_Py_error_handler error_handler,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
const char *end = s + size;

// fast path: try ASCII string.
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
return -1;
}

Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
if (writer->kind == PyUnicode_1BYTE_KIND
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
{
Py_ssize_t decoded = ascii_decode(s, end, dest);
writer->pos += decoded;

if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
s += decoded;
}

Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
Expand Down Expand Up @@ -4828,6 +4805,8 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
endinpos = startinpos + ch - 1;
break;
default:
// ch doesn't fit into kind, so change the buffer kind to write
// the character
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
goto onError;
continue;
Expand Down Expand Up @@ -4899,8 +4878,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed)
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}

Expand All @@ -4912,19 +4892,81 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]);
}

// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_Init(&writer);
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;

if (unicode_decode_utf8_writer(&writer, s, size,
error_handler, errors,
consumed) < 0) {
if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
consumed) < 0) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
return _PyUnicodeWriter_Finish(&writer);
}


/* Decode the `size` bytes starting at `s` as UTF-8 and append the result to
 * `writer`.
 *
 * An ASCII fast path is tried first, directly into the writer's buffer; if
 * the whole input is consumed that way the function returns immediately.
 * Otherwise the remaining bytes are handed to unicode_decode_utf8_impl()
 * together with the original start pointer.
 *
 * When `consumed` is non-NULL it is set to 0 for empty input and to `size`
 * when the fast path decodes everything; on the slow path it is passed
 * through to unicode_decode_utf8_impl().
 *
 * Returns 0 on the early-exit paths, -1 if _PyUnicodeWriter_Prepare() fails,
 * and otherwise whatever unicode_decode_utf8_impl() returns.
 */
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
                           _Py_error_handler error_handler, const char *errors,
                           Py_ssize_t *consumed)
{
    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        return 0;
    }

    // fast path: try ASCII string.
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
        return -1;
    }

    const char *starts = s;
    const char *end = s + size;
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
    // The fast path writes one byte per character, so it is only attempted
    // on a 1-byte-kind buffer whose write position is size_t-aligned.
    if (writer->kind == PyUnicode_1BYTE_KIND
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
    {
        // The return value is used as the number of characters written
        // (and input bytes consumed) by ascii_decode().
        Py_ssize_t decoded = ascii_decode(s, end, dest);
        writer->pos += decoded;

        if (decoded == size) {
            if (consumed) {
                *consumed = size;
            }
            return 0;
        }
        // Resume after the decoded prefix. `end` already marks the end of
        // the input, so `size` does not need to be adjusted.
        s += decoded;
    }

    return unicode_decode_utf8_impl(writer, starts, s, end,
                                    error_handler, errors, consumed);
}


PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
Expand Down

0 comments on commit 99f4b13

Please sign in to comment.