diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index f1c9bcd47888b1..b83039c1869f23 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -11,10 +11,14 @@ extern "C" { #include "pycore_fileutils.h" // _Py_error_handler #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI + // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111). #define _Py_MAX_UNICODE 0x10ffff +extern int _PyUnicode_IsModifiable(PyObject *unicode); + + static inline void _PyUnicode_Fill(int kind, void *data, Py_UCS4 value, Py_ssize_t start, Py_ssize_t length) @@ -48,6 +52,28 @@ _PyUnicode_Fill(int kind, void *data, Py_UCS4 value, } } +static inline int +_PyUnicode_EnsureUnicode(PyObject *obj) +{ + if (!PyUnicode_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "must be str, not %T", obj); + return -1; + } + return 0; +} + +static inline int +_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) +{ + assert(ch <= _Py_MAX_UNICODE); + if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) + return -1; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); + writer->pos++; + return 0; +} + /* --- Characters Type APIs ----------------------------------------------- */ diff --git a/Makefile.pre.in b/Makefile.pre.in index a5223246845dcf..19423c11545c19 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -557,9 +557,10 @@ OBJECT_OBJS= \ Objects/tupleobject.o \ Objects/typeobject.o \ Objects/typevarobject.o \ + Objects/unicode_format.o \ Objects/unicode_formatter.o \ - Objects/unicodeobject.o \ Objects/unicodectype.o \ + Objects/unicodeobject.o \ Objects/unionobject.o \ Objects/weakrefobject.o \ @PERF_TRAMPOLINE_OBJ@ @@ -2105,6 +2106,7 @@ Objects/bytes_methods.o: $(srcdir)/Objects/bytes_methods.c $(BYTESTR_DEPS) Objects/bytesobject.o: $(srcdir)/Objects/bytesobject.c $(BYTESTR_DEPS) Objects/bytearrayobject.o: $(srcdir)/Objects/bytearrayobject.c $(BYTESTR_DEPS) +Objects/unicode_format.o: 
$(srcdir)/Objects/unicode_format.c $(UNICODE_DEPS) Objects/unicodeobject.o: $(srcdir)/Objects/unicodeobject.c $(UNICODE_DEPS) Objects/dictobject.o: $(srcdir)/Objects/stringlib/eq.h diff --git a/Objects/unicode_format.c b/Objects/unicode_format.c new file mode 100644 index 00000000000000..26bdae55d8b931 --- /dev/null +++ b/Objects/unicode_format.c @@ -0,0 +1,1002 @@ +/* + +Unicode implementation based on original code by Fredrik Lundh, +modified by Marc-Andre Lemburg . + +Major speed upgrades to the method implementations at the Reykjavik +NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. + +Copyright (c) Corporation for National Research Initiatives. + +-------------------------------------------------------------------- +The original string type implementation is: + + Copyright (c) 1999 by Secret Labs AB + Copyright (c) 1999 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its +associated documentation, you agree that you have read, understood, +and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all +copies, and that both that copyright notice and this permission notice +appear in supporting documentation, and that the name of Secret Labs +AB or the author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +-------------------------------------------------------------------- + +*/ + +// PyUnicode_Format() implementation + +#include "Python.h" +#include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_format.h" // F_ALT +#include "pycore_long.h" // _PyLong_FormatWriter() +#include "pycore_object.h" // _PyObject_IsUniquelyReferenced() +#include "pycore_unicodeobject.h" // _Py_MAX_UNICODE + + +#define MAX_UNICODE _Py_MAX_UNICODE +#define ensure_unicode _PyUnicode_EnsureUnicode + +struct unicode_formatter_t { + PyObject *args; + int args_owned; + Py_ssize_t arglen, argidx; + PyObject *dict; + + int fmtkind; + Py_ssize_t fmtcnt, fmtpos; + const void *fmtdata; + PyObject *fmtstr; + + _PyUnicodeWriter writer; +}; + + +struct unicode_format_arg_t { + Py_UCS4 ch; + int flags; + Py_ssize_t width; + int prec; + int sign; +}; + + +static PyObject * +unicode_format_getnextarg(struct unicode_formatter_t *ctx) +{ + Py_ssize_t argidx = ctx->argidx; + + if (argidx < ctx->arglen) { + ctx->argidx++; + if (ctx->arglen < 0) + return ctx->args; + else + return PyTuple_GetItem(ctx->args, argidx); + } + PyErr_SetString(PyExc_TypeError, + "not enough arguments for format string"); + return NULL; +} + + +/* Returns a new reference to a PyUnicode object, or NULL on failure. */ + +/* Format a float into the writer if the writer is not NULL, or into *p_output + otherwise. + + Return 0 on success, raise an exception and return -1 on error. 
*/ +static int +formatfloat(PyObject *v, struct unicode_format_arg_t *arg, + PyObject **p_output, + _PyUnicodeWriter *writer) +{ + char *p; + double x; + Py_ssize_t len; + int prec; + int dtoa_flags = 0; + + x = PyFloat_AsDouble(v); + if (x == -1.0 && PyErr_Occurred()) + return -1; + + prec = arg->prec; + if (prec < 0) + prec = 6; + + if (arg->flags & F_ALT) + dtoa_flags |= Py_DTSF_ALT; + p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); + if (p == NULL) + return -1; + len = strlen(p); + if (writer) { + if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { + PyMem_Free(p); + return -1; + } + } + else + *p_output = _PyUnicode_FromASCII(p, len); + PyMem_Free(p); + return 0; +} + + +/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X, + * and the alternate form ("#" flag), for Python ints of any size. + * + * Return value: a new str object, or NULL with an exception set on error. + * The output string is of the form + * "-"? ("0o" | "0x" | "0X")? digit+ + * The base prefix is present only for o, x and X conversions when alt + * is nonzero. For X, the hex digits and the prefix are upper-cased. + * There will be at least prec digits, zero-filled on the left if + * necessary to get that many. + * val int object to be converted + * alt nonzero to keep the base prefix (alternate form) + * prec minimum number of digits; 0-fill on left if needed + * type a character in [diuoxX]; i and u act the same as d + * + * CAUTION: the result can start with a '-' sign for every conversion, + * including o, x and X.
+ */ +PyObject * +_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) +{ + PyObject *result = NULL; + char *buf; + Py_ssize_t i; + int sign; /* 1 if '-', else 0 */ + int len; /* number of characters */ + Py_ssize_t llen; + int numdigits; /* len == numnondigits + numdigits */ + int numnondigits = 0; + + /* Avoid exceeding SSIZE_T_MAX */ + if (prec > INT_MAX-3) { + PyErr_SetString(PyExc_OverflowError, + "precision too large"); + return NULL; + } + + assert(PyLong_Check(val)); + + switch (type) { + default: + Py_UNREACHABLE(); + case 'd': + case 'i': + case 'u': + /* int and int subclasses should print numerically when a numeric */ + /* format code is used (see issue18780) */ + result = PyNumber_ToBase(val, 10); + break; + case 'o': + numnondigits = 2; + result = PyNumber_ToBase(val, 8); + break; + case 'x': + case 'X': + numnondigits = 2; + result = PyNumber_ToBase(val, 16); + break; + } + if (!result) + return NULL; + + assert(_PyUnicode_IsModifiable(result)); + assert(PyUnicode_IS_ASCII(result)); + + /* To modify the string in-place, there can only be one reference. */ + if (!_PyObject_IsUniquelyReferenced(result)) { + Py_DECREF(result); + PyErr_BadInternalCall(); + return NULL; + } + buf = PyUnicode_DATA(result); + llen = PyUnicode_GET_LENGTH(result); + if (llen > INT_MAX) { + Py_DECREF(result); + PyErr_SetString(PyExc_ValueError, + "string too large in _PyUnicode_FormatLong"); + return NULL; + } + len = (int)llen; + sign = buf[0] == '-'; + numnondigits += sign; + numdigits = len - numnondigits; + assert(numdigits > 0); + + /* Get rid of base marker unless F_ALT */ + if (((alt) == 0 && + (type == 'o' || type == 'x' || type == 'X'))) { + assert(buf[sign] == '0'); + assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || + buf[sign+1] == 'o'); + numnondigits -= 2; + buf += 2; + len -= 2; + if (sign) + buf[0] = '-'; + assert(len == numnondigits + numdigits); + assert(numdigits > 0); + } + + /* Fill with leading zeroes to meet minimum width. 
*/ + if (prec > numdigits) { + PyObject *r1 = PyBytes_FromStringAndSize(NULL, + numnondigits + prec); + char *b1; + if (!r1) { + Py_DECREF(result); + return NULL; + } + b1 = PyBytes_AS_STRING(r1); + for (i = 0; i < numnondigits; ++i) + *b1++ = *buf++; + for (i = 0; i < prec - numdigits; i++) + *b1++ = '0'; + for (i = 0; i < numdigits; i++) + *b1++ = *buf++; + *b1 = '\0'; + Py_SETREF(result, r1); + buf = PyBytes_AS_STRING(result); + len = numnondigits + prec; + } + + /* Fix up case for hex conversions. */ + if (type == 'X') { + /* Need to convert all lower case letters to upper case. + and need to convert 0x to 0X (and -0x to -0X). */ + for (i = 0; i < len; i++) + if (buf[i] >= 'a' && buf[i] <= 'x') + buf[i] -= 'a'-'A'; + } + if (!PyUnicode_Check(result) + || buf != PyUnicode_DATA(result)) { + PyObject *unicode; + unicode = _PyUnicode_FromASCII(buf, len); + Py_SETREF(result, unicode); + } + else if (len != PyUnicode_GET_LENGTH(result)) { + if (PyUnicode_Resize(&result, len) < 0) + Py_CLEAR(result); + } + return result; +} + + +/* Format an integer or a float as an integer. 
+ * Return 1 if the number has been formatted into the writer, + * 0 if the number has been formatted into *p_output + * -1 and raise an exception on error */ +static int +mainformatlong(PyObject *v, + struct unicode_format_arg_t *arg, + PyObject **p_output, + _PyUnicodeWriter *writer) +{ + PyObject *iobj, *res; + char type = (char)arg->ch; + + if (!PyNumber_Check(v)) + goto wrongtype; + + /* make sure number is a type of integer for o, x, and X */ + if (!PyLong_Check(v)) { + if (type == 'o' || type == 'x' || type == 'X') { + iobj = _PyNumber_Index(v); + } + else { + iobj = PyNumber_Long(v); + } + if (iobj == NULL ) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + goto wrongtype; + return -1; + } + assert(PyLong_Check(iobj)); + } + else { + iobj = Py_NewRef(v); + } + + if (PyLong_CheckExact(v) + && arg->width == -1 && arg->prec == -1 + && !(arg->flags & (F_SIGN | F_BLANK)) + && type != 'X') + { + /* Fast path */ + int alternate = arg->flags & F_ALT; + int base; + + switch(type) + { + default: + Py_UNREACHABLE(); + case 'd': + case 'i': + case 'u': + base = 10; + break; + case 'o': + base = 8; + break; + case 'x': + case 'X': + base = 16; + break; + } + + if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { + Py_DECREF(iobj); + return -1; + } + Py_DECREF(iobj); + return 1; + } + + res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); + Py_DECREF(iobj); + if (res == NULL) + return -1; + *p_output = res; + return 0; + +wrongtype: + switch(type) + { + case 'o': + case 'x': + case 'X': + PyErr_Format(PyExc_TypeError, + "%%%c format: an integer is required, " + "not %.200s", + type, Py_TYPE(v)->tp_name); + break; + default: + PyErr_Format(PyExc_TypeError, + "%%%c format: a real number is required, " + "not %.200s", + type, Py_TYPE(v)->tp_name); + break; + } + return -1; +} + + +static Py_UCS4 +formatchar(PyObject *v) +{ + /* Return the code point to write for "%c", or (Py_UCS4)-1 with an exception set on error. */ + if (PyUnicode_Check(v)) { + if (PyUnicode_GET_LENGTH(v) ==
1) { + return PyUnicode_READ_CHAR(v, 0); + } + PyErr_Format(PyExc_TypeError, + "%%c requires an int or a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(v)); + return (Py_UCS4) -1; + } + else { + int overflow; + long x = PyLong_AsLongAndOverflow(v, &overflow); + if (x == -1 && PyErr_Occurred()) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, + "%%c requires an int or a unicode character, not %T", + v); + return (Py_UCS4) -1; + } + return (Py_UCS4) -1; + } + + if (x < 0 || x > MAX_UNICODE) { + /* this includes an overflow in converting to C long */ + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x110000)"); + return (Py_UCS4) -1; + } + + return (Py_UCS4) x; + } +} + + +/* Parse options of an argument: flags, width, precision. + Handle also "%(name)" syntax. + + On success, store the parsed flags, width and precision in *arg, leave + the conversion character in arg->ch and return 0. + Raise an exception and return -1 on error. */ +static int +unicode_format_arg_parse(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg) +{ +#define FORMAT_READ(ctx) \ + PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) + + PyObject *v; + + if (arg->ch == '(') { + /* Get argument value from a dictionary. Example: "%(name)s".
*/ + Py_ssize_t keystart; + Py_ssize_t keylen; + PyObject *key; + int pcount = 1; + + if (ctx->dict == NULL) { + PyErr_SetString(PyExc_TypeError, + "format requires a mapping"); + return -1; + } + ++ctx->fmtpos; + --ctx->fmtcnt; + keystart = ctx->fmtpos; + /* Skip over balanced parentheses */ + while (pcount > 0 && --ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + if (arg->ch == ')') + --pcount; + else if (arg->ch == '(') + ++pcount; + ctx->fmtpos++; + } + keylen = ctx->fmtpos - keystart - 1; + if (ctx->fmtcnt < 0 || pcount > 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format key"); + return -1; + } + key = PyUnicode_Substring(ctx->fmtstr, + keystart, keystart + keylen); + if (key == NULL) + return -1; + if (ctx->args_owned) { + ctx->args_owned = 0; + Py_DECREF(ctx->args); + } + ctx->args = PyObject_GetItem(ctx->dict, key); + Py_DECREF(key); + if (ctx->args == NULL) + return -1; + ctx->args_owned = 1; + ctx->arglen = -1; + ctx->argidx = -2; + } + + /* Parse flags. Example: "%+i" => flags=F_SIGN. */ + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + switch (arg->ch) { + case '-': arg->flags |= F_LJUST; continue; + case '+': arg->flags |= F_SIGN; continue; + case ' ': arg->flags |= F_BLANK; continue; + case '#': arg->flags |= F_ALT; continue; + case '0': arg->flags |= F_ZERO; continue; + } + break; + } + + /* Parse width. 
Example: "%10s" => width=10 */ + if (arg->ch == '*') { + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; + if (!PyLong_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + return -1; + } + arg->width = PyLong_AsSsize_t(v); + if (arg->width == -1 && PyErr_Occurred()) + return -1; + if (arg->width < 0) { + arg->flags |= F_LJUST; + arg->width = -arg->width; + } + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + } + else if (arg->ch >= '0' && arg->ch <= '9') { + arg->width = arg->ch - '0'; + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + if (arg->ch < '0' || arg->ch > '9') + break; + /* Since arg->ch is unsigned, the RHS would end up as unsigned, + mixing signed and unsigned comparison. Since arg->ch is between + '0' and '9', casting to int is safe. */ + if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + return -1; + } + arg->width = arg->width*10 + (arg->ch - '0'); + } + } + + /* Parse precision. 
Example: "%.3f" => prec=3 */ + if (arg->ch == '.') { + arg->prec = 0; + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + if (arg->ch == '*') { + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; + if (!PyLong_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + return -1; + } + arg->prec = PyLong_AsInt(v); + if (arg->prec == -1 && PyErr_Occurred()) + return -1; + if (arg->prec < 0) + arg->prec = 0; + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + } + else if (arg->ch >= '0' && arg->ch <= '9') { + arg->prec = arg->ch - '0'; + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + if (arg->ch < '0' || arg->ch > '9') + break; + if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "precision too big"); + return -1; + } + arg->prec = arg->prec*10 + (arg->ch - '0'); + } + } + } + + /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ + if (ctx->fmtcnt >= 0) { + if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + } + } + if (ctx->fmtcnt < 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format"); + return -1; + } + return 0; + +#undef FORMAT_READ +} + + +/* Format one argument. Supported conversion specifiers: + + - "s", "r", "a": any type + - "i", "d", "u": int or float + - "o", "x", "X": int + - "e", "E", "f", "F", "g", "G": float + - "c": int or str (1 character) + + When possible, the output is written directly into the Unicode writer + (ctx->writer). A string is created when padding is required. + + Return 0 if the argument has been formatted into *p_str, + 1 if the argument has been written into ctx->writer, + -1 on error. 
*/ +static int +unicode_format_arg_format(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg, + PyObject **p_str) +{ + PyObject *v; + _PyUnicodeWriter *writer = &ctx->writer; + + if (ctx->fmtcnt == 0) + ctx->writer.overallocate = 0; + + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; + + + switch (arg->ch) { + case 's': + case 'r': + case 'a': + if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { + /* Fast path */ + if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) + return -1; + return 1; + } + + if (PyUnicode_CheckExact(v) && arg->ch == 's') { + *p_str = Py_NewRef(v); + } + else { + if (arg->ch == 's') + *p_str = PyObject_Str(v); + else if (arg->ch == 'r') + *p_str = PyObject_Repr(v); + else + *p_str = PyObject_ASCII(v); + } + break; + + case 'i': + case 'd': + case 'u': + case 'o': + case 'x': + case 'X': + { + int ret = mainformatlong(v, arg, p_str, writer); + if (ret != 0) + return ret; + arg->sign = 1; + break; + } + + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + if (arg->width == -1 && arg->prec == -1 + && !(arg->flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + if (formatfloat(v, arg, NULL, writer) == -1) + return -1; + return 1; + } + + arg->sign = 1; + if (formatfloat(v, arg, p_str, NULL) == -1) + return -1; + break; + + case 'c': + { + Py_UCS4 ch = formatchar(v); + if (ch == (Py_UCS4) -1) + return -1; + if (arg->width == -1 && arg->prec == -1) { + /* Fast path */ + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) + return -1; + return 1; + } + *p_str = PyUnicode_FromOrdinal(ch); + break; + } + + default: + PyErr_Format(PyExc_ValueError, + "unsupported format character '%c' (0x%x) " + "at index %zd", + (31<=arg->ch && arg->ch<=126) ? 
(char)arg->ch : '?', + (int)arg->ch, + ctx->fmtpos - 1); + return -1; + } + if (*p_str == NULL) + return -1; + assert (PyUnicode_Check(*p_str)); + return 0; +} + + +static int +unicode_format_arg_output(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg, + PyObject *str) +{ + Py_ssize_t len; + int kind; + const void *pbuf; + Py_ssize_t pindex; + Py_UCS4 signchar; + Py_ssize_t buflen; + Py_UCS4 maxchar; + Py_ssize_t sublen; + _PyUnicodeWriter *writer = &ctx->writer; + Py_UCS4 fill; + + fill = ' '; + if (arg->sign && arg->flags & F_ZERO) + fill = '0'; + + len = PyUnicode_GET_LENGTH(str); + if ((arg->width == -1 || arg->width <= len) + && (arg->prec == -1 || arg->prec >= len) + && !(arg->flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) + return -1; + return 0; + } + + /* Truncate the string for "s", "r" and "a" formats + if the precision is set */ + if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { + if (arg->prec >= 0 && len > arg->prec) + len = arg->prec; + } + + /* Adjust sign and width */ + kind = PyUnicode_KIND(str); + pbuf = PyUnicode_DATA(str); + pindex = 0; + signchar = '\0'; + if (arg->sign) { + Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); + if (ch == '-' || ch == '+') { + signchar = ch; + len--; + pindex++; + } + else if (arg->flags & F_SIGN) + signchar = '+'; + else if (arg->flags & F_BLANK) + signchar = ' '; + else + arg->sign = 0; + } + if (arg->width < len) + arg->width = len; + + /* Prepare the writer */ + maxchar = writer->maxchar; + if (!(arg->flags & F_LJUST)) { + if (arg->sign) { + if ((arg->width-1) > len) + maxchar = Py_MAX(maxchar, fill); + } + else { + if (arg->width > len) + maxchar = Py_MAX(maxchar, fill); + } + } + if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { + Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); + maxchar = Py_MAX(maxchar, strmaxchar); + } + + buflen = arg->width; + if (arg->sign && len == arg->width) + buflen++; + if 
(_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) + return -1; + + /* Write the sign if needed */ + if (arg->sign) { + if (fill != ' ') { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); + writer->pos += 1; + } + if (arg->width > len) + arg->width--; + } + + /* Write the numeric prefix for "x", "X" and "o" formats + if the alternate form is used. + For example, write "0x" for the "%#x" format. */ + if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { + assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); + assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); + if (fill != ' ') { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); + writer->pos += 2; + pindex += 2; + } + arg->width -= 2; + if (arg->width < 0) + arg->width = 0; + len -= 2; + } + + /* Pad left with the fill character if needed */ + if (arg->width > len && !(arg->flags & F_LJUST)) { + sublen = arg->width - len; + _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen); + writer->pos += sublen; + arg->width = len; + } + + /* If padding with spaces: write sign if needed and/or numeric prefix if + the alternate form is used */ + if (fill == ' ') { + if (arg->sign) { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); + writer->pos += 1; + } + if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { + assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); + assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); + writer->pos += 2; + pindex += 2; + } + } + + /* Write characters */ + if (len) { + _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, pindex, len); + writer->pos += len; + } + + /* Pad right with the fill character if needed */ + if (arg->width > 
len) { + sublen = arg->width - len; + _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen); + writer->pos += sublen; + } + return 0; +} + + +/* Helper of PyUnicode_Format(): format one arg. + Return 0 on success, raise an exception and return -1 on error. */ +static int +unicode_format_arg(struct unicode_formatter_t *ctx) +{ + struct unicode_format_arg_t arg; + PyObject *str; + int ret; + + arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); + if (arg.ch == '%') { + ctx->fmtpos++; + ctx->fmtcnt--; + if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0) + return -1; + return 0; + } + arg.flags = 0; + arg.width = -1; + arg.prec = -1; + arg.sign = 0; + str = NULL; + + ret = unicode_format_arg_parse(ctx, &arg); + if (ret == -1) + return -1; + + ret = unicode_format_arg_format(ctx, &arg, &str); + if (ret == -1) + return -1; + + if (ret != 1) { + ret = unicode_format_arg_output(ctx, &arg, str); + Py_DECREF(str); + if (ret == -1) + return -1; + } + + if (ctx->dict && (ctx->argidx < ctx->arglen)) { + PyErr_SetString(PyExc_TypeError, + "not all arguments converted during string formatting"); + return -1; + } + return 0; +} + + +PyObject * +PyUnicode_Format(PyObject *format, PyObject *args) +{ + struct unicode_formatter_t ctx; + + if (format == NULL || args == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + + if (ensure_unicode(format) < 0) + return NULL; + + ctx.fmtstr = format; + ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); + ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); + ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); + ctx.fmtpos = 0; + + _PyUnicodeWriter_Init(&ctx.writer); + ctx.writer.min_length = ctx.fmtcnt + 100; + ctx.writer.overallocate = 1; + + if (PyTuple_Check(args)) { + ctx.arglen = PyTuple_Size(args); + ctx.argidx = 0; + } + else { + ctx.arglen = -1; + ctx.argidx = -2; + } + ctx.args_owned = 0; + if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) + ctx.dict = args; + else + ctx.dict = NULL; + 
ctx.args = args; + + while (--ctx.fmtcnt >= 0) { + if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { + Py_ssize_t nonfmtpos; + + nonfmtpos = ctx.fmtpos++; + while (ctx.fmtcnt >= 0 && + PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { + ctx.fmtpos++; + ctx.fmtcnt--; + } + if (ctx.fmtcnt < 0) { + ctx.fmtpos--; + ctx.writer.overallocate = 0; + } + + if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, + nonfmtpos, ctx.fmtpos) < 0) + goto onError; + } + else { + ctx.fmtpos++; + if (unicode_format_arg(&ctx) == -1) + goto onError; + } + } + + if (ctx.argidx < ctx.arglen && !ctx.dict) { + PyErr_SetString(PyExc_TypeError, + "not all arguments converted during string formatting"); + goto onError; + } + + if (ctx.args_owned) { + Py_DECREF(ctx.args); + } + return _PyUnicodeWriter_Finish(&ctx.writer); + + onError: + _PyUnicodeWriter_Dealloc(&ctx.writer); + if (ctx.args_owned) { + Py_DECREF(ctx.args); + } + return NULL; +} diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c71f9d3f71dea5..a67bf9b1c5337b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -105,6 +105,7 @@ NOTE: In the interpreter's initialization phase, some globals are currently */ #define MAX_UNICODE _Py_MAX_UNICODE +#define ensure_unicode _PyUnicode_EnsureUnicode #ifdef Py_DEBUG # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) @@ -452,7 +453,6 @@ const unsigned char _Py_ascii_whitespace[] = { /* forward */ static PyObject* get_latin1_char(unsigned char ch); -static int unicode_modifiable(PyObject *unicode); static PyObject * @@ -983,18 +983,6 @@ make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) #undef BLOOM_UPDATE } -static int -ensure_unicode(PyObject *obj) -{ - if (!PyUnicode_Check(obj)) { - PyErr_Format(PyExc_TypeError, - "must be str, not %.100s", - Py_TYPE(obj)->tp_name); - return -1; - } - return 0; -} - /* Compilation of templated routines */ #define STRINGLIB_GET_EMPTY() unicode_get_empty() @@ -1120,7 
+1108,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); #endif - if (!unicode_modifiable(unicode)) { + if (!_PyUnicode_IsModifiable(unicode)) { PyObject *copy = resize_copy(unicode, length); if (copy == NULL) { return NULL; @@ -1412,7 +1400,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) static int unicode_check_modifiable(PyObject *unicode) { - if (!unicode_modifiable(unicode)) { + if (!_PyUnicode_IsModifiable(unicode)) { PyErr_SetString(PyExc_SystemError, "Cannot modify a string currently used"); return -1; @@ -1774,8 +1762,8 @@ unicode_is_singleton(PyObject *unicode) } #endif -static int -unicode_modifiable(PyObject *unicode) +int +_PyUnicode_IsModifiable(PyObject *unicode) { assert(_PyUnicode_CHECK(unicode)); if (!_PyObject_IsUniquelyReferenced(unicode)) @@ -1816,7 +1804,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) return 0; } - if (!unicode_modifiable(unicode)) { + if (!_PyUnicode_IsModifiable(unicode)) { PyObject *copy = resize_copy(unicode, length); if (copy == NULL) return -1; @@ -10252,7 +10240,7 @@ _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, { const int kind = PyUnicode_KIND(unicode); void *data = PyUnicode_DATA(unicode); - assert(unicode_modifiable(unicode)); + assert(_PyUnicode_IsModifiable(unicode)); assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); assert(start >= 0); assert(start + length <= PyUnicode_GET_LENGTH(unicode)); @@ -11524,7 +11512,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) } new_len = left_len + right_len; - if (unicode_modifiable(left) + if (_PyUnicode_IsModifiable(left) && PyUnicode_CheckExact(right) && PyUnicode_KIND(right) <= PyUnicode_KIND(left) /* Don't resize for ascii += latin1. 
Convert ascii to latin1 requires @@ -13722,17 +13710,6 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); } -static inline int -_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) -{ - assert(ch <= MAX_UNICODE); - if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) - return -1; - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); - writer->pos++; - return 0; -} - int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) { @@ -14461,947 +14438,6 @@ static PyMappingMethods unicode_as_mapping = { }; -/* Helpers for PyUnicode_Format() */ - -struct unicode_formatter_t { - PyObject *args; - int args_owned; - Py_ssize_t arglen, argidx; - PyObject *dict; - - int fmtkind; - Py_ssize_t fmtcnt, fmtpos; - const void *fmtdata; - PyObject *fmtstr; - - _PyUnicodeWriter writer; -}; - -struct unicode_format_arg_t { - Py_UCS4 ch; - int flags; - Py_ssize_t width; - int prec; - int sign; -}; - -static PyObject * -unicode_format_getnextarg(struct unicode_formatter_t *ctx) -{ - Py_ssize_t argidx = ctx->argidx; - - if (argidx < ctx->arglen) { - ctx->argidx++; - if (ctx->arglen < 0) - return ctx->args; - else - return PyTuple_GetItem(ctx->args, argidx); - } - PyErr_SetString(PyExc_TypeError, - "not enough arguments for format string"); - return NULL; -} - -/* Returns a new reference to a PyUnicode object, or NULL on failure. */ - -/* Format a float into the writer if the writer is not NULL, or into *p_output - otherwise. - - Return 0 on success, raise an exception and return -1 on error. 
*/ -static int -formatfloat(PyObject *v, struct unicode_format_arg_t *arg, - PyObject **p_output, - _PyUnicodeWriter *writer) -{ - char *p; - double x; - Py_ssize_t len; - int prec; - int dtoa_flags = 0; - - x = PyFloat_AsDouble(v); - if (x == -1.0 && PyErr_Occurred()) - return -1; - - prec = arg->prec; - if (prec < 0) - prec = 6; - - if (arg->flags & F_ALT) - dtoa_flags |= Py_DTSF_ALT; - p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); - if (p == NULL) - return -1; - len = strlen(p); - if (writer) { - if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { - PyMem_Free(p); - return -1; - } - } - else - *p_output = _PyUnicode_FromASCII(p, len); - PyMem_Free(p); - return 0; -} - -/* formatlong() emulates the format codes d, u, o, x and X, and - * the F_ALT flag, for Python's long (unbounded) ints. It's not used for - * Python's regular ints. - * Return value: a new PyUnicodeObject*, or NULL if error. - * The output string is of the form - * "-"? ("0x" | "0X")? digit+ - * "0x"/"0X" are present only for x and X conversions, with F_ALT - * set in flags. The case of hex digits will be correct, - * There will be at least prec digits, zero-filled on the left if - * necessary to get that many. - * val object to be converted - * flags bitmask of format flags; only F_ALT is looked at - * prec minimum number of digits; 0-fill on left if needed - * type a character in [duoxX]; u acts the same as d - * - * CAUTION: o, x and X conversions on regular ints can never - * produce a '-' sign, but can for Python's unbounded ints. 
- */ -PyObject * -_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) -{ - PyObject *result = NULL; - char *buf; - Py_ssize_t i; - int sign; /* 1 if '-', else 0 */ - int len; /* number of characters */ - Py_ssize_t llen; - int numdigits; /* len == numnondigits + numdigits */ - int numnondigits = 0; - - /* Avoid exceeding SSIZE_T_MAX */ - if (prec > INT_MAX-3) { - PyErr_SetString(PyExc_OverflowError, - "precision too large"); - return NULL; - } - - assert(PyLong_Check(val)); - - switch (type) { - default: - Py_UNREACHABLE(); - case 'd': - case 'i': - case 'u': - /* int and int subclasses should print numerically when a numeric */ - /* format code is used (see issue18780) */ - result = PyNumber_ToBase(val, 10); - break; - case 'o': - numnondigits = 2; - result = PyNumber_ToBase(val, 8); - break; - case 'x': - case 'X': - numnondigits = 2; - result = PyNumber_ToBase(val, 16); - break; - } - if (!result) - return NULL; - - assert(unicode_modifiable(result)); - assert(PyUnicode_IS_ASCII(result)); - - /* To modify the string in-place, there can only be one reference. */ - if (!_PyObject_IsUniquelyReferenced(result)) { - Py_DECREF(result); - PyErr_BadInternalCall(); - return NULL; - } - buf = PyUnicode_DATA(result); - llen = PyUnicode_GET_LENGTH(result); - if (llen > INT_MAX) { - Py_DECREF(result); - PyErr_SetString(PyExc_ValueError, - "string too large in _PyUnicode_FormatLong"); - return NULL; - } - len = (int)llen; - sign = buf[0] == '-'; - numnondigits += sign; - numdigits = len - numnondigits; - assert(numdigits > 0); - - /* Get rid of base marker unless F_ALT */ - if (((alt) == 0 && - (type == 'o' || type == 'x' || type == 'X'))) { - assert(buf[sign] == '0'); - assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || - buf[sign+1] == 'o'); - numnondigits -= 2; - buf += 2; - len -= 2; - if (sign) - buf[0] = '-'; - assert(len == numnondigits + numdigits); - assert(numdigits > 0); - } - - /* Fill with leading zeroes to meet minimum width. 
*/ - if (prec > numdigits) { - PyObject *r1 = PyBytes_FromStringAndSize(NULL, - numnondigits + prec); - char *b1; - if (!r1) { - Py_DECREF(result); - return NULL; - } - b1 = PyBytes_AS_STRING(r1); - for (i = 0; i < numnondigits; ++i) - *b1++ = *buf++; - for (i = 0; i < prec - numdigits; i++) - *b1++ = '0'; - for (i = 0; i < numdigits; i++) - *b1++ = *buf++; - *b1 = '\0'; - Py_SETREF(result, r1); - buf = PyBytes_AS_STRING(result); - len = numnondigits + prec; - } - - /* Fix up case for hex conversions. */ - if (type == 'X') { - /* Need to convert all lower case letters to upper case. - and need to convert 0x to 0X (and -0x to -0X). */ - for (i = 0; i < len; i++) - if (buf[i] >= 'a' && buf[i] <= 'x') - buf[i] -= 'a'-'A'; - } - if (!PyUnicode_Check(result) - || buf != PyUnicode_DATA(result)) { - PyObject *unicode; - unicode = _PyUnicode_FromASCII(buf, len); - Py_SETREF(result, unicode); - } - else if (len != PyUnicode_GET_LENGTH(result)) { - if (PyUnicode_Resize(&result, len) < 0) - Py_CLEAR(result); - } - return result; -} - -/* Format an integer or a float as an integer. 
- * Return 1 if the number has been formatted into the writer, - * 0 if the number has been formatted into *p_output - * -1 and raise an exception on error */ -static int -mainformatlong(PyObject *v, - struct unicode_format_arg_t *arg, - PyObject **p_output, - _PyUnicodeWriter *writer) -{ - PyObject *iobj, *res; - char type = (char)arg->ch; - - if (!PyNumber_Check(v)) - goto wrongtype; - - /* make sure number is a type of integer for o, x, and X */ - if (!PyLong_Check(v)) { - if (type == 'o' || type == 'x' || type == 'X') { - iobj = _PyNumber_Index(v); - } - else { - iobj = PyNumber_Long(v); - } - if (iobj == NULL ) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - goto wrongtype; - return -1; - } - assert(PyLong_Check(iobj)); - } - else { - iobj = Py_NewRef(v); - } - - if (PyLong_CheckExact(v) - && arg->width == -1 && arg->prec == -1 - && !(arg->flags & (F_SIGN | F_BLANK)) - && type != 'X') - { - /* Fast path */ - int alternate = arg->flags & F_ALT; - int base; - - switch(type) - { - default: - Py_UNREACHABLE(); - case 'd': - case 'i': - case 'u': - base = 10; - break; - case 'o': - base = 8; - break; - case 'x': - case 'X': - base = 16; - break; - } - - if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { - Py_DECREF(iobj); - return -1; - } - Py_DECREF(iobj); - return 1; - } - - res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); - Py_DECREF(iobj); - if (res == NULL) - return -1; - *p_output = res; - return 0; - -wrongtype: - switch(type) - { - case 'o': - case 'x': - case 'X': - PyErr_Format(PyExc_TypeError, - "%%%c format: an integer is required, " - "not %.200s", - type, Py_TYPE(v)->tp_name); - break; - default: - PyErr_Format(PyExc_TypeError, - "%%%c format: a real number is required, " - "not %.200s", - type, Py_TYPE(v)->tp_name); - break; - } - return -1; -} - -static Py_UCS4 -formatchar(PyObject *v) -{ - /* presume that the buffer is at least 3 characters long */ - if (PyUnicode_Check(v)) { - if (PyUnicode_GET_LENGTH(v) == 1) 
{ - return PyUnicode_READ_CHAR(v, 0); - } - PyErr_Format(PyExc_TypeError, - "%%c requires an int or a unicode character, " - "not a string of length %zd", - PyUnicode_GET_LENGTH(v)); - return (Py_UCS4) -1; - } - else { - int overflow; - long x = PyLong_AsLongAndOverflow(v, &overflow); - if (x == -1 && PyErr_Occurred()) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) { - PyErr_Format(PyExc_TypeError, - "%%c requires an int or a unicode character, not %T", - v); - return (Py_UCS4) -1; - } - return (Py_UCS4) -1; - } - - if (x < 0 || x > MAX_UNICODE) { - /* this includes an overflow in converting to C long */ - PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x110000)"); - return (Py_UCS4) -1; - } - - return (Py_UCS4) x; - } -} - -/* Parse options of an argument: flags, width, precision. - Handle also "%(name)" syntax. - - Return 0 if the argument has been formatted into arg->str. - Return 1 if the argument has been written into ctx->writer, - Raise an exception and return -1 on error. */ -static int -unicode_format_arg_parse(struct unicode_formatter_t *ctx, - struct unicode_format_arg_t *arg) -{ -#define FORMAT_READ(ctx) \ - PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) - - PyObject *v; - - if (arg->ch == '(') { - /* Get argument value from a dictionary. Example: "%(name)s". 
*/ - Py_ssize_t keystart; - Py_ssize_t keylen; - PyObject *key; - int pcount = 1; - - if (ctx->dict == NULL) { - PyErr_SetString(PyExc_TypeError, - "format requires a mapping"); - return -1; - } - ++ctx->fmtpos; - --ctx->fmtcnt; - keystart = ctx->fmtpos; - /* Skip over balanced parentheses */ - while (pcount > 0 && --ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - if (arg->ch == ')') - --pcount; - else if (arg->ch == '(') - ++pcount; - ctx->fmtpos++; - } - keylen = ctx->fmtpos - keystart - 1; - if (ctx->fmtcnt < 0 || pcount > 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format key"); - return -1; - } - key = PyUnicode_Substring(ctx->fmtstr, - keystart, keystart + keylen); - if (key == NULL) - return -1; - if (ctx->args_owned) { - ctx->args_owned = 0; - Py_DECREF(ctx->args); - } - ctx->args = PyObject_GetItem(ctx->dict, key); - Py_DECREF(key); - if (ctx->args == NULL) - return -1; - ctx->args_owned = 1; - ctx->arglen = -1; - ctx->argidx = -2; - } - - /* Parse flags. Example: "%+i" => flags=F_SIGN. */ - while (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - switch (arg->ch) { - case '-': arg->flags |= F_LJUST; continue; - case '+': arg->flags |= F_SIGN; continue; - case ' ': arg->flags |= F_BLANK; continue; - case '#': arg->flags |= F_ALT; continue; - case '0': arg->flags |= F_ZERO; continue; - } - break; - } - - /* Parse width. 
Example: "%10s" => width=10 */ - if (arg->ch == '*') { - v = unicode_format_getnextarg(ctx); - if (v == NULL) - return -1; - if (!PyLong_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - return -1; - } - arg->width = PyLong_AsSsize_t(v); - if (arg->width == -1 && PyErr_Occurred()) - return -1; - if (arg->width < 0) { - arg->flags |= F_LJUST; - arg->width = -arg->width; - } - if (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - } - } - else if (arg->ch >= '0' && arg->ch <= '9') { - arg->width = arg->ch - '0'; - while (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - if (arg->ch < '0' || arg->ch > '9') - break; - /* Since arg->ch is unsigned, the RHS would end up as unsigned, - mixing signed and unsigned comparison. Since arg->ch is between - '0' and '9', casting to int is safe. */ - if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - return -1; - } - arg->width = arg->width*10 + (arg->ch - '0'); - } - } - - /* Parse precision. 
Example: "%.3f" => prec=3 */ - if (arg->ch == '.') { - arg->prec = 0; - if (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - } - if (arg->ch == '*') { - v = unicode_format_getnextarg(ctx); - if (v == NULL) - return -1; - if (!PyLong_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - return -1; - } - arg->prec = PyLong_AsInt(v); - if (arg->prec == -1 && PyErr_Occurred()) - return -1; - if (arg->prec < 0) - arg->prec = 0; - if (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - } - } - else if (arg->ch >= '0' && arg->ch <= '9') { - arg->prec = arg->ch - '0'; - while (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - if (arg->ch < '0' || arg->ch > '9') - break; - if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "precision too big"); - return -1; - } - arg->prec = arg->prec*10 + (arg->ch - '0'); - } - } - } - - /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ - if (ctx->fmtcnt >= 0) { - if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { - if (--ctx->fmtcnt >= 0) { - arg->ch = FORMAT_READ(ctx); - ctx->fmtpos++; - } - } - } - if (ctx->fmtcnt < 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format"); - return -1; - } - return 0; - -#undef FORMAT_READ -} - -/* Format one argument. Supported conversion specifiers: - - - "s", "r", "a": any type - - "i", "d", "u": int or float - - "o", "x", "X": int - - "e", "E", "f", "F", "g", "G": float - - "c": int or str (1 character) - - When possible, the output is written directly into the Unicode writer - (ctx->writer). A string is created when padding is required. - - Return 0 if the argument has been formatted into *p_str, - 1 if the argument has been written into ctx->writer, - -1 on error. 
*/ -static int -unicode_format_arg_format(struct unicode_formatter_t *ctx, - struct unicode_format_arg_t *arg, - PyObject **p_str) -{ - PyObject *v; - _PyUnicodeWriter *writer = &ctx->writer; - - if (ctx->fmtcnt == 0) - ctx->writer.overallocate = 0; - - v = unicode_format_getnextarg(ctx); - if (v == NULL) - return -1; - - - switch (arg->ch) { - case 's': - case 'r': - case 'a': - if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { - /* Fast path */ - if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) - return -1; - return 1; - } - - if (PyUnicode_CheckExact(v) && arg->ch == 's') { - *p_str = Py_NewRef(v); - } - else { - if (arg->ch == 's') - *p_str = PyObject_Str(v); - else if (arg->ch == 'r') - *p_str = PyObject_Repr(v); - else - *p_str = PyObject_ASCII(v); - } - break; - - case 'i': - case 'd': - case 'u': - case 'o': - case 'x': - case 'X': - { - int ret = mainformatlong(v, arg, p_str, writer); - if (ret != 0) - return ret; - arg->sign = 1; - break; - } - - case 'e': - case 'E': - case 'f': - case 'F': - case 'g': - case 'G': - if (arg->width == -1 && arg->prec == -1 - && !(arg->flags & (F_SIGN | F_BLANK))) - { - /* Fast path */ - if (formatfloat(v, arg, NULL, writer) == -1) - return -1; - return 1; - } - - arg->sign = 1; - if (formatfloat(v, arg, p_str, NULL) == -1) - return -1; - break; - - case 'c': - { - Py_UCS4 ch = formatchar(v); - if (ch == (Py_UCS4) -1) - return -1; - if (arg->width == -1 && arg->prec == -1) { - /* Fast path */ - if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) - return -1; - return 1; - } - *p_str = PyUnicode_FromOrdinal(ch); - break; - } - - default: - PyErr_Format(PyExc_ValueError, - "unsupported format character '%c' (0x%x) " - "at index %zd", - (31<=arg->ch && arg->ch<=126) ? 
(char)arg->ch : '?', - (int)arg->ch, - ctx->fmtpos - 1); - return -1; - } - if (*p_str == NULL) - return -1; - assert (PyUnicode_Check(*p_str)); - return 0; -} - -static int -unicode_format_arg_output(struct unicode_formatter_t *ctx, - struct unicode_format_arg_t *arg, - PyObject *str) -{ - Py_ssize_t len; - int kind; - const void *pbuf; - Py_ssize_t pindex; - Py_UCS4 signchar; - Py_ssize_t buflen; - Py_UCS4 maxchar; - Py_ssize_t sublen; - _PyUnicodeWriter *writer = &ctx->writer; - Py_UCS4 fill; - - fill = ' '; - if (arg->sign && arg->flags & F_ZERO) - fill = '0'; - - len = PyUnicode_GET_LENGTH(str); - if ((arg->width == -1 || arg->width <= len) - && (arg->prec == -1 || arg->prec >= len) - && !(arg->flags & (F_SIGN | F_BLANK))) - { - /* Fast path */ - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) - return -1; - return 0; - } - - /* Truncate the string for "s", "r" and "a" formats - if the precision is set */ - if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { - if (arg->prec >= 0 && len > arg->prec) - len = arg->prec; - } - - /* Adjust sign and width */ - kind = PyUnicode_KIND(str); - pbuf = PyUnicode_DATA(str); - pindex = 0; - signchar = '\0'; - if (arg->sign) { - Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); - if (ch == '-' || ch == '+') { - signchar = ch; - len--; - pindex++; - } - else if (arg->flags & F_SIGN) - signchar = '+'; - else if (arg->flags & F_BLANK) - signchar = ' '; - else - arg->sign = 0; - } - if (arg->width < len) - arg->width = len; - - /* Prepare the writer */ - maxchar = writer->maxchar; - if (!(arg->flags & F_LJUST)) { - if (arg->sign) { - if ((arg->width-1) > len) - maxchar = Py_MAX(maxchar, fill); - } - else { - if (arg->width > len) - maxchar = Py_MAX(maxchar, fill); - } - } - if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { - Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); - maxchar = Py_MAX(maxchar, strmaxchar); - } - - buflen = arg->width; - if (arg->sign && len == arg->width) - buflen++; - if 
(_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) - return -1; - - /* Write the sign if needed */ - if (arg->sign) { - if (fill != ' ') { - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); - writer->pos += 1; - } - if (arg->width > len) - arg->width--; - } - - /* Write the numeric prefix for "x", "X" and "o" formats - if the alternate form is used. - For example, write "0x" for the "%#x" format. */ - if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { - assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); - assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); - if (fill != ' ') { - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); - PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); - writer->pos += 2; - pindex += 2; - } - arg->width -= 2; - if (arg->width < 0) - arg->width = 0; - len -= 2; - } - - /* Pad left with the fill character if needed */ - if (arg->width > len && !(arg->flags & F_LJUST)) { - sublen = arg->width - len; - _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen); - writer->pos += sublen; - arg->width = len; - } - - /* If padding with spaces: write sign if needed and/or numeric prefix if - the alternate form is used */ - if (fill == ' ') { - if (arg->sign) { - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); - writer->pos += 1; - } - if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { - assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); - assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); - PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); - PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); - writer->pos += 2; - pindex += 2; - } - } - - /* Write characters */ - if (len) { - _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, - str, pindex, len); - writer->pos += len; - } - - /* Pad right with the fill character if needed */ - if (arg->width > 
len) { - sublen = arg->width - len; - _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen); - writer->pos += sublen; - } - return 0; -} - -/* Helper of PyUnicode_Format(): format one arg. - Return 0 on success, raise an exception and return -1 on error. */ -static int -unicode_format_arg(struct unicode_formatter_t *ctx) -{ - struct unicode_format_arg_t arg; - PyObject *str; - int ret; - - arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); - if (arg.ch == '%') { - ctx->fmtpos++; - ctx->fmtcnt--; - if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0) - return -1; - return 0; - } - arg.flags = 0; - arg.width = -1; - arg.prec = -1; - arg.sign = 0; - str = NULL; - - ret = unicode_format_arg_parse(ctx, &arg); - if (ret == -1) - return -1; - - ret = unicode_format_arg_format(ctx, &arg, &str); - if (ret == -1) - return -1; - - if (ret != 1) { - ret = unicode_format_arg_output(ctx, &arg, str); - Py_DECREF(str); - if (ret == -1) - return -1; - } - - if (ctx->dict && (ctx->argidx < ctx->arglen)) { - PyErr_SetString(PyExc_TypeError, - "not all arguments converted during string formatting"); - return -1; - } - return 0; -} - -PyObject * -PyUnicode_Format(PyObject *format, PyObject *args) -{ - struct unicode_formatter_t ctx; - - if (format == NULL || args == NULL) { - PyErr_BadInternalCall(); - return NULL; - } - - if (ensure_unicode(format) < 0) - return NULL; - - ctx.fmtstr = format; - ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); - ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); - ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); - ctx.fmtpos = 0; - - _PyUnicodeWriter_Init(&ctx.writer); - ctx.writer.min_length = ctx.fmtcnt + 100; - ctx.writer.overallocate = 1; - - if (PyTuple_Check(args)) { - ctx.arglen = PyTuple_Size(args); - ctx.argidx = 0; - } - else { - ctx.arglen = -1; - ctx.argidx = -2; - } - ctx.args_owned = 0; - if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) - ctx.dict = args; - else - ctx.dict = NULL; - 
ctx.args = args; - - while (--ctx.fmtcnt >= 0) { - if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { - Py_ssize_t nonfmtpos; - - nonfmtpos = ctx.fmtpos++; - while (ctx.fmtcnt >= 0 && - PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { - ctx.fmtpos++; - ctx.fmtcnt--; - } - if (ctx.fmtcnt < 0) { - ctx.fmtpos--; - ctx.writer.overallocate = 0; - } - - if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, - nonfmtpos, ctx.fmtpos) < 0) - goto onError; - } - else { - ctx.fmtpos++; - if (unicode_format_arg(&ctx) == -1) - goto onError; - } - } - - if (ctx.argidx < ctx.arglen && !ctx.dict) { - PyErr_SetString(PyExc_TypeError, - "not all arguments converted during string formatting"); - goto onError; - } - - if (ctx.args_owned) { - Py_DECREF(ctx.args); - } - return _PyUnicodeWriter_Finish(&ctx.writer); - - onError: - _PyUnicodeWriter_Dealloc(&ctx.writer); - if (ctx.args_owned) { - Py_DECREF(ctx.args); - } - return NULL; -} - static PyObject * unicode_subtype_new(PyTypeObject *type, PyObject *unicode); diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index c4a11fa9b242bd..02b6f35798f845 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -164,6 +164,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 7bbbec2c9887bf..39462a6380cd21 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -481,6 +481,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index e2e1e415827e6f..2657ee5c444e60 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -558,6 +558,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 7e7ed9c2ae6c43..9c12be6e9356a6 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -1271,6 +1271,9 @@ Objects + + 
Objects + Objects