diff --git a/Makefile.pre.in b/Makefile.pre.in index 9e99c95e2af042..a80d9334ba5134 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1841,6 +1841,7 @@ UNICODE_DEPS = \ $(srcdir)/Objects/stringlib/localeutil.h \ $(srcdir)/Objects/stringlib/partition.h \ $(srcdir)/Objects/stringlib/replace.h \ + $(srcdir)/Objects/stringlib/repr.h \ $(srcdir)/Objects/stringlib/split.h \ $(srcdir)/Objects/stringlib/ucs1lib.h \ $(srcdir)/Objects/stringlib/ucs2lib.h \ diff --git a/Objects/stringlib/repr.h b/Objects/stringlib/repr.h new file mode 100644 index 00000000000000..87b1a8ba629dc6 --- /dev/null +++ b/Objects/stringlib/repr.h @@ -0,0 +1,95 @@ +/* stringlib: repr() implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + + +static void +STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote, + STRINGLIB_CHAR *odata) +{ + Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode); + const void *idata = PyUnicode_DATA(unicode); + int ikind = PyUnicode_KIND(unicode); + + *odata++ = quote; + for (Py_ssize_t i = 0; i < isize; i++) { + Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); + + /* Escape quotes and backslashes */ + if ((ch == quote) || (ch == '\\')) { + *odata++ = '\\'; + *odata++ = ch; + continue; + } + + /* Map special whitespace to '\t', \n', '\r' */ + if (ch == '\t') { + *odata++ = '\\'; + *odata++ = 't'; + } + else if (ch == '\n') { + *odata++ = '\\'; + *odata++ = 'n'; + } + else if (ch == '\r') { + *odata++ = '\\'; + *odata++ = 'r'; + } + + /* Map non-printable US ASCII to '\xhh' */ + else if (ch < ' ' || ch == 0x7F) { + *odata++ = '\\'; + *odata++ = 'x'; + *odata++ = Py_hexdigits[(ch >> 4) & 0x000F]; + *odata++ = Py_hexdigits[ch & 0x000F]; + } + + /* Copy ASCII characters as-is */ + else if (ch < 0x7F) { + *odata++ = ch; + } + + /* Non-ASCII characters */ + else { + /* Map Unicode whitespace and control characters + (categories Z* and C* except ASCII space) + */ + if (!Py_UNICODE_ISPRINTABLE(ch)) { + *odata++ = '\\'; + /* Map 8-bit characters to '\xhh' */ + if (ch <= 0xff) { + *odata++ = 'x'; + *odata++ = Py_hexdigits[(ch >> 4) & 0x000F]; + *odata++ = Py_hexdigits[ch & 0x000F]; + } + /* Map 16-bit characters to '\uxxxx' */ + else if (ch <= 0xffff) { + *odata++ = 'u'; + *odata++ = Py_hexdigits[(ch >> 12) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 8) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 4) & 0xF]; + *odata++ = Py_hexdigits[ch & 0xF]; + } + /* Map 21-bit characters to '\U00xxxxxx' */ + else { + *odata++ = 'U'; + *odata++ = Py_hexdigits[(ch >> 28) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 24) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 20) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 16) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 12) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 8) & 0xF]; + *odata++ = Py_hexdigits[(ch >> 4) & 0xF]; + *odata++ = Py_hexdigits[ch & 0xF]; + } + } + /* Copy characters as-is */ + else { + *odata++ = ch; + } + } + } + *odata = quote; +} diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 92db31f1e498f9..eb37b478cc4de1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj) #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/replace.h" +#include "stringlib/repr.h" #include "stringlib/find_max_char.h" #include "stringlib/undef.h" @@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj) #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/replace.h" +#include "stringlib/repr.h" #include "stringlib/find_max_char.h" #include "stringlib/undef.h" @@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj) #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/replace.h" +#include "stringlib/repr.h" #include "stringlib/find_max_char.h" #include "stringlib/undef.h" @@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix) static PyObject * unicode_repr(PyObject *unicode) { - PyObject *repr; - Py_ssize_t isize; - Py_ssize_t osize, squote, dquote, i, o; - Py_UCS4 max, quote; - int ikind, okind, unchanged; - const void *idata; - void *odata; - - isize = PyUnicode_GET_LENGTH(unicode); - idata = PyUnicode_DATA(unicode); + Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode); + const void *idata = PyUnicode_DATA(unicode); /* Compute length of output, quote characters, and maximum character */ - osize = 0; - max = 127; - squote = dquote = 0; - ikind = PyUnicode_KIND(unicode); - for (i = 0; i < isize; i++) { + Py_ssize_t osize = 0; + Py_UCS4 maxch = 127; + Py_ssize_t squote = 0; + Py_ssize_t dquote = 0; + int ikind = PyUnicode_KIND(unicode); + for (Py_ssize_t i = 0; i < isize; i++) { Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); Py_ssize_t incr = 1; switch (ch) { @@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode) else if (ch < 0x7f) ; else if (Py_UNICODE_ISPRINTABLE(ch)) - max = ch > max ? ch : max; + maxch = (ch > maxch) ? ch : maxch; else if (ch < 0x100) incr = 4; /* \xHH */ else if (ch < 0x10000) @@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode) osize += incr; } - quote = '\''; - unchanged = (osize == isize); + Py_UCS4 quote = '\''; + int changed = (osize != isize); if (squote) { - unchanged = 0; + changed = 1; if (dquote) /* Both squote and dquote present. Use squote, and escape them */ @@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode) } osize += 2; /* quotes */ - repr = PyUnicode_New(osize, max); + PyObject *repr = PyUnicode_New(osize, maxch); if (repr == NULL) return NULL; - okind = PyUnicode_KIND(repr); - odata = PyUnicode_DATA(repr); + int okind = PyUnicode_KIND(repr); + void *odata = PyUnicode_DATA(repr); + + if (!changed) { + PyUnicode_WRITE(okind, odata, 0, quote); - PyUnicode_WRITE(okind, odata, 0, quote); - PyUnicode_WRITE(okind, odata, osize-1, quote); - if (unchanged) { _PyUnicode_FastCopyCharacters(repr, 1, unicode, 0, isize); + + PyUnicode_WRITE(okind, odata, osize-1, quote); } else { - for (i = 0, o = 1; i < isize; i++) { - Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); - - /* Escape quotes and backslashes */ - if ((ch == quote) || (ch == '\\')) { - PyUnicode_WRITE(okind, odata, o++, '\\'); - PyUnicode_WRITE(okind, odata, o++, ch); - continue; - } - - /* Map special whitespace to '\t', \n', '\r' */ - if (ch == '\t') { - PyUnicode_WRITE(okind, odata, o++, '\\'); - PyUnicode_WRITE(okind, odata, o++, 't'); - } - else if (ch == '\n') { - PyUnicode_WRITE(okind, odata, o++, '\\'); - PyUnicode_WRITE(okind, odata, o++, 'n'); - } - else if (ch == '\r') { - PyUnicode_WRITE(okind, odata, o++, '\\'); - PyUnicode_WRITE(okind, odata, o++, 'r'); - } - - /* Map non-printable US ASCII to '\xhh' */ - else if (ch < ' ' || ch == 0x7F) { - PyUnicode_WRITE(okind, odata, o++, '\\'); - PyUnicode_WRITE(okind, odata, o++, 'x'); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); - } - - /* Copy ASCII characters as-is */ - else if (ch < 0x7F) { - PyUnicode_WRITE(okind, odata, o++, ch); - } - - /* Non-ASCII characters */ - else { - /* Map Unicode whitespace and control characters - (categories Z* and C* except ASCII space) - */ - if (!Py_UNICODE_ISPRINTABLE(ch)) { - PyUnicode_WRITE(okind, odata, o++, '\\'); - /* Map 8-bit characters to '\xhh' */ - if (ch <= 0xff) { - PyUnicode_WRITE(okind, odata, o++, 'x'); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); - } - /* Map 16-bit characters to '\uxxxx' */ - else if (ch <= 0xffff) { - PyUnicode_WRITE(okind, odata, o++, 'u'); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); - } - /* Map 21-bit characters to '\U00xxxxxx' */ - else { - PyUnicode_WRITE(okind, odata, o++, 'U'); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); - PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); - } - } - /* Copy characters as-is */ - else { - PyUnicode_WRITE(okind, odata, o++, ch); - } - } + switch (okind) { + case PyUnicode_1BYTE_KIND: + ucs1lib_repr(unicode, quote, odata); + break; + case PyUnicode_2BYTE_KIND: + ucs2lib_repr(unicode, quote, odata); + break; + default: + assert(okind == PyUnicode_4BYTE_KIND); + ucs4lib_repr(unicode, quote, odata); } } - /* Closing quote already added at the beginning */ + assert(_PyUnicode_CheckConsistency(repr, 1)); return repr; } diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index 12010f0e9c0549..4623f2c8d671bd 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -167,6 +167,7 @@ def clean_lines(text): Objects/stringlib/find.h Objects/stringlib/fastsearch.h Objects/stringlib/partition.h Objects/stringlib/fastsearch.h Objects/stringlib/replace.h Objects/stringlib/fastsearch.h +Objects/stringlib/repr.h Objects/stringlib/fastsearch.h Objects/stringlib/split.h Objects/stringlib/fastsearch.h # @end=tsv@