Skip to content

Commit e9c538d

Browse files
authored
gh-139156: Optimize _PyUnicode_EncodeCharmap() (#139306)
Specialize _PyUnicode_EncodeCharmap() for EncodingMapType, which is used by Python codecs such as iso8859_15.
1 parent 1963e70 commit e9c538d

File tree

1 file changed

+61
-14
lines changed

1 file changed

+61
-14
lines changed

Objects/unicodeobject.c

Lines changed: 61 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6435,6 +6435,8 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64356435
#endif
64366436

64376437
if (kind == PyUnicode_1BYTE_KIND) {
6438+
// gh-139156: Don't use PyBytesWriter API here since it has an overhead
6439+
// on short strings
64386440
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
64396441
if (v == NULL) {
64406442
return NULL;
@@ -8852,11 +8854,15 @@ charmapencode_output(Py_UCS4 c, PyObject *mapping,
88528854
if (Py_IS_TYPE(mapping, &EncodingMapType)) {
88538855
int res = encoding_map_lookup(c, mapping);
88548856
Py_ssize_t requiredsize = *outpos+1;
8855-
if (res == -1)
8857+
if (res == -1) {
88568858
return enc_FAILED;
8857-
if (outsize<requiredsize)
8858-
if (charmapencode_resize(writer, outpos, requiredsize))
8859+
}
8860+
8861+
if (outsize<requiredsize) {
8862+
if (charmapencode_resize(writer, outpos, requiredsize)) {
88598863
return enc_EXCEPTION;
8864+
}
8865+
}
88608866
outstart = _PyBytesWriter_GetData(writer);
88618867
outstart[(*outpos)++] = (char)res;
88628868
return enc_SUCCESS;
@@ -8897,7 +8903,7 @@ charmapencode_output(Py_UCS4 c, PyObject *mapping,
88978903
return enc_SUCCESS;
88988904
}
88998905

8900-
/* handle an error in PyUnicode_EncodeCharmap
8906+
/* handle an error in _PyUnicode_EncodeCharmap()
89018907
Return 0 on success, -1 on error */
89028908
static int
89038909
charmap_encoding_error(
@@ -9075,23 +9081,64 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
90759081
Py_ssize_t respos = 0;
90769082
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
90779083

9078-
while (inpos<size) {
9079-
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9080-
/* try to encode it */
9081-
charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
9082-
if (x==enc_EXCEPTION) /* error */
9083-
goto onError;
9084-
if (x==enc_FAILED) { /* unencodable character */
9084+
if (Py_IS_TYPE(mapping, &EncodingMapType)) {
9085+
char *outstart = _PyBytesWriter_GetData(writer);
9086+
Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
9087+
9088+
while (inpos<size) {
9089+
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9090+
9091+
/* try to encode it */
9092+
int res = encoding_map_lookup(ch, mapping);
9093+
Py_ssize_t requiredsize = respos+1;
9094+
if (res == -1) {
9095+
goto enc_FAILED;
9096+
}
9097+
9098+
if (outsize<requiredsize) {
9099+
if (charmapencode_resize(writer, &respos, requiredsize)) {
9100+
goto onError;
9101+
}
9102+
outstart = _PyBytesWriter_GetData(writer);
9103+
outsize = _PyBytesWriter_GetSize(writer);
9104+
}
9105+
outstart[respos++] = (char)res;
9106+
9107+
/* done with this character => adjust input position */
9108+
++inpos;
9109+
continue;
9110+
9111+
enc_FAILED:
90859112
if (charmap_encoding_error(unicode, &inpos, mapping,
90869113
&exc,
90879114
&error_handler, &error_handler_obj, errors,
90889115
writer, &respos)) {
90899116
goto onError;
90909117
}
9118+
outstart = _PyBytesWriter_GetData(writer);
9119+
outsize = _PyBytesWriter_GetSize(writer);
90919120
}
9092-
else {
9093-
/* done with this character => adjust input position */
9094-
++inpos;
9121+
}
9122+
else {
9123+
while (inpos<size) {
9124+
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9125+
/* try to encode it */
9126+
charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
9127+
if (x==enc_EXCEPTION) { /* error */
9128+
goto onError;
9129+
}
9130+
if (x==enc_FAILED) { /* unencodable character */
9131+
if (charmap_encoding_error(unicode, &inpos, mapping,
9132+
&exc,
9133+
&error_handler, &error_handler_obj, errors,
9134+
writer, &respos)) {
9135+
goto onError;
9136+
}
9137+
}
9138+
else {
9139+
/* done with this character => adjust input position */
9140+
++inpos;
9141+
}
90959142
}
90969143
}
90979144

0 commit comments

Comments
 (0)