From af02e1c85a66009cdc645a64de7d7ee1335c8301 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 16 Dec 2011 23:56:01 +0100 Subject: [PATCH] Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() * PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string from the current locale encoding * _Py_char2wchar() writes an "error code" in the size argument to indicate if the function failed because of memory allocation failure or because of a decoding error. The function doesn't write the error message directly to stderr. * Fix time.strftime() (if wcsftime() is missing): decode strftime() result from the current locale encoding, not from the filesystem encoding. --- Doc/c-api/unicode.rst | 40 +++++++++++++++++ Include/unicodeobject.h | 22 ++++++++++ Modules/_localemodule.c | 57 +++++-------------------- Modules/main.c | 13 +++--- Modules/timemodule.c | 6 +-- Objects/unicodeobject.c | 95 +++++++++++++++++++++++++++++++++-------- Python/fileutils.c | 25 +++++++---- 7 files changed, 174 insertions(+), 84 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 81ed54045868e7..0bf2eea6f100f0 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python throughout the interpreter whenever coercion to Unicode is needed. +Locale Encoding +""""""""""""""" + +The current locale encoding can be used to decode text from the operating +system. + +.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape) + + Decode a string from the current locale encoding. The decoder is strict if + *surrogateescape* is equal to zero, otherwise it uses the + ``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable + bytes. 
If a byte sequence can be decoded as a surrogate character and + *surrogateescape* is not equal to zero, the byte sequence is escaped using + the ``'surrogateescape'`` error handler instead of being decoded. *str* + must end with a null character but cannot contain embedded null characters. + + .. seealso:: + + Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from + :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at + Python startup). + + .. versionadded:: 3.3 + + +.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape) + + Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string + length using :c:func:`strlen`. + + .. versionadded:: 3.3 + + File System Encoding """""""""""""""""""" @@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the locale encoding. + .. seealso:: + + :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the + locale encoding and cannot be modified later. If you need to decode a + string from the current locale encoding, use + :c:func:`PyUnicode_DecodeLocaleAndSize`. + .. versionchanged:: 3.2 Use ``'strict'`` error handler on Windows. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index cd35ae629a95e9..5f073e0625acde 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( ); #endif +/* --- Locale encoding --------------------------------------------------- */ + +/* Decode a string from the current locale encoding. The decoder is strict if + *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' + error handler (PEP 383) to escape undecodable bytes. 
If a byte sequence can + be decoded as a surrogate character and *surrogateescape* is not equal to + zero, the byte sequence is escaped using the 'surrogateescape' error handler + instead of being decoded. *str* must end with a null character but cannot + contain embedded null characters. */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( + const char *str, + Py_ssize_t len, + int surrogateescape); + +/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string + length using strlen(). */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( + const char *str, + int surrogateescape); + /* --- File system encoding ---------------------------------------------- */ /* ParseTuple converter: encode str objects to bytes using diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index 9bba1b39cf7c33..1cab7c0a7482be 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales."); static PyObject *Error; -/* Convert a char* to a Unicode object according to the current locale */ -static PyObject* -str2uni(const char* s) -{ -#ifdef HAVE_BROKEN_MBSTOWCS - size_t needed = strlen(s); -#else - size_t needed = mbstowcs(NULL, s, 0); -#endif - size_t res1; - wchar_t smallbuf[30]; - wchar_t *dest; - PyObject *res2; - if (needed == (size_t)-1) { - PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string"); - return NULL; - } - if (needed*sizeof(wchar_t) < sizeof(smallbuf)) - dest = smallbuf; - else { - dest = PyMem_Malloc((needed+1)*sizeof(wchar_t)); - if (!dest) - return PyErr_NoMemory(); - } - /* This shouldn't fail now */ - res1 = mbstowcs(dest, s, needed+1); -#ifdef HAVE_BROKEN_MBSTOWCS - assert(res1 != (size_t)-1); -#else - assert(res1 == needed); -#endif - res2 = PyUnicode_FromWideChar(dest, res1); - if (dest != smallbuf) - PyMem_Free(dest); - return res2; -} - /* support functions for formatting floating point numbers */ PyDoc_STRVAR(setlocale__doc__, @@ -149,7 +112,7 @@ 
PyLocale_setlocale(PyObject* self, PyObject* args) PyErr_SetString(Error, "unsupported locale setting"); return NULL; } - result_object = str2uni(result); + result_object = PyUnicode_DecodeLocale(result, 0); if (!result_object) return NULL; } else { @@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args) PyErr_SetString(Error, "locale query failed"); return NULL; } - result_object = str2uni(result); + result_object = PyUnicode_DecodeLocale(result, 0); } return result_object; } @@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self) involved herein */ #define RESULT_STRING(s)\ - x = str2uni(l->s); \ + x = PyUnicode_DecodeLocale(l->s, 0); \ if (!x) goto failed;\ PyDict_SetItemString(result, #s, x);\ Py_XDECREF(x) @@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args) instead of an empty string for nl_langinfo(ERA). */ const char *result = nl_langinfo(item); result = result != NULL ? result : ""; - return str2uni(result); + return PyUnicode_DecodeLocale(result, 0); } PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant"); return NULL; @@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args) char *in; if (!PyArg_ParseTuple(args, "s", &in)) return 0; - return str2uni(gettext(in)); + return PyUnicode_DecodeLocale(gettext(in), 0); } PyDoc_STRVAR(dgettext__doc__, @@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args) char *domain, *in; if (!PyArg_ParseTuple(args, "zs", &domain, &in)) return 0; - return str2uni(dgettext(domain, in)); + return PyUnicode_DecodeLocale(dgettext(domain, in), 0); } PyDoc_STRVAR(dcgettext__doc__, @@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args) int category; if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category)) return 0; - return str2uni(dcgettext(domain,msgid,category)); + return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0); } PyDoc_STRVAR(textdomain__doc__, @@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args) 
PyErr_SetFromErrno(PyExc_OSError); return NULL; } - return str2uni(domain); + return PyUnicode_DecodeLocale(domain, 0); } PyDoc_STRVAR(bindtextdomain__doc__, @@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args) PyErr_SetFromErrno(PyExc_OSError); return NULL; } - result = str2uni(current_dirname); + result = PyUnicode_DecodeLocale(current_dirname, 0); Py_XDECREF(dirname_bytes); return result; } @@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args) return NULL; codeset = bind_textdomain_codeset(domain, codeset); if (codeset) - return str2uni(codeset); + return PyUnicode_DecodeLocale(codeset, 0); Py_RETURN_NONE; } #endif diff --git a/Modules/main.c b/Modules/main.c index d4c3314d24f998..4899378dc85bae 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv) /* Use utf-8 on Mac OS X */ unicode = PyUnicode_FromString(p); #else - wchar_t *wchar; - size_t len; - wchar = _Py_char2wchar(p, &len); - if (wchar == NULL) - continue; - unicode = PyUnicode_FromWideChar(wchar, len); - PyMem_Free(wchar); + unicode = PyUnicode_DecodeLocale(p, 1); #endif - if (unicode == NULL) + if (unicode == NULL) { + /* ignore errors */ + PyErr_Clear(); continue; + } PySys_AddWarnOptionUnicode(unicode); Py_DECREF(unicode); } diff --git a/Modules/timemodule.c b/Modules/timemodule.c index 001b311731b4bd..a46c4f11e40c57 100644 --- a/Modules/timemodule.c +++ b/Modules/timemodule.c @@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args) #ifdef HAVE_WCSFTIME ret = PyUnicode_FromWideChar(outbuf, buflen); #else - ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen); + ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1); #endif PyMem_Free(outbuf); break; @@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) { #endif /* PYOS_OS2 */ #endif PyModule_AddIntConstant(m, "daylight", daylight); - otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0])); - otz1 = 
PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1])); + otz0 = PyUnicode_DecodeLocale(tzname[0], 1); + otz1 = PyUnicode_DecodeLocale(tzname[1], 1); PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1)); #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/ #ifdef HAVE_STRUCT_TM_TM_ZONE diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5758ffacf31aae..7444c8b4ba0c12 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode, return NULL; } +PyObject* +PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, + int surrogateescape) +{ + wchar_t smallbuf[256]; + size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); + wchar_t *wstr; + size_t wlen, wlen2; + PyObject *unicode; + + if (str[len] != '\0' || len != strlen(str)) { + PyErr_SetString(PyExc_TypeError, "embedded null character"); + return NULL; + } + + if (surrogateescape) + { + wstr = _Py_char2wchar(str, &wlen); + if (wstr == NULL) { + if (wlen == (size_t)-1) + PyErr_NoMemory(); + else + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + + unicode = PyUnicode_FromWideChar(wstr, wlen); + PyMem_Free(wstr); + } + else { +#ifndef HAVE_BROKEN_MBSTOWCS + wlen = mbstowcs(NULL, str, 0); +#else + wlen = len; +#endif + if (wlen == (size_t)-1) { + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + if (wlen+1 <= smallbuf_len) { + wstr = smallbuf; + } + else { + if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) + return PyErr_NoMemory(); + + wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); + if (!wstr) + return PyErr_NoMemory(); + } + + /* This shouldn't fail now */ + wlen2 = mbstowcs(wstr, str, wlen+1); + if (wlen2 == (size_t)-1) { + if (wstr != smallbuf) + PyMem_Free(wstr); + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } +#ifdef HAVE_BROKEN_MBSTOWCS + assert(wlen2 == wlen); +#endif + unicode = PyUnicode_FromWideChar(wstr, wlen2); + if (wstr != smallbuf) + PyMem_Free(wstr); + } + return 
unicode; +} + +PyObject* +PyUnicode_DecodeLocale(const char *str, int surrogateescape) +{ + Py_ssize_t size = (Py_ssize_t)strlen(str); + return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape); +} + + PyObject* PyUnicode_DecodeFSDefault(const char *s) { Py_ssize_t size = (Py_ssize_t)strlen(s); @@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) "surrogateescape"); } else { - /* locale encoding with surrogateescape */ - wchar_t *wchar; - PyObject *unicode; - size_t len; - - if (s[size] != '\0' || size != strlen(s)) { - PyErr_SetString(PyExc_TypeError, "embedded NUL character"); - return NULL; - } - - wchar = _Py_char2wchar(s, &len); - if (wchar == NULL) - return PyErr_NoMemory(); - - unicode = PyUnicode_FromWideChar(wchar, len); - PyMem_Free(wchar); - return unicode; + return PyUnicode_DecodeLocaleAndSize(s, size, 1); } #endif } diff --git a/Python/fileutils.c b/Python/fileutils.c index 0afa415d59b3f5..0aad2200fb1fe6 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -16,7 +16,9 @@ Return a pointer to a newly allocated wide character string (use PyMem_Free() to free the memory) and write the number of written wide characters excluding the null character into *size if size is not NULL, or - NULL on error (conversion or memory allocation error). + NULL on error (decoding or memory allocation error). If size is not NULL, + *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding + error. Conversion errors should never happen, unless there is a bug in the C library. */ @@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t *size) since we provide everything that we have - unless there is a bug in the C library, or I misunderstood how mbrtowc works. */ - fprintf(stderr, "unexpected mbrtowc result -2\n"); PyMem_Free(res); + if (size != NULL) + *size = (size_t)-2; return NULL; } if (converted == (size_t)-1) { @@ -112,7 +115,8 @@ _Py_char2wchar(const char* arg, size_t *size) is ASCII (i.e. 
escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); - if (!res) goto oom; + if (!res) + goto oom; in = (unsigned char*)arg; out = res; while(*in) @@ -126,7 +130,8 @@ _Py_char2wchar(const char* arg, size_t *size) *size = out - res; return res; oom: - fprintf(stderr, "out of memory\n"); + if (size != NULL) + *size = (size_t)-1; return NULL; } @@ -137,10 +142,10 @@ _Py_char2wchar(const char* arg, size_t *size) This function is the reverse of _Py_char2wchar(). Return a pointer to a newly allocated byte string (use PyMem_Free() to free - the memory), or NULL on conversion or memory allocation error. + the memory), or NULL on encoding or memory allocation error. If error_pos is not NULL: *error_pos is the index of the invalid character - on conversion error, or (size_t)-1 otherwise. */ + on encoding error, or (size_t)-1 otherwise. */ char* _Py_wchar2char(const wchar_t *text, size_t *error_pos) { @@ -328,7 +333,7 @@ _Py_fopen(PyObject *path, const char *mode) #ifdef HAVE_READLINK /* Read value of symbolic link. Encode the path to the locale encoding, decode - the result from the locale encoding. */ + the result from the locale encoding. Return -1 on error. */ int _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz) @@ -372,7 +377,8 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz) #ifdef HAVE_REALPATH /* Return the canonicalized absolute pathname. Encode path to the locale - encoding, decode the result from the locale encoding. */ + encoding, decode the result from the locale encoding. + Return NULL on error. */ wchar_t* _Py_wrealpath(const wchar_t *path, @@ -410,7 +416,8 @@ _Py_wrealpath(const wchar_t *path, #endif /* Get the current directory. size is the buffer size in wide characters - including the null character. Decode the path from the locale encoding. */ + including the null character. 
Decode the path from the locale encoding. + Return NULL on error. */ wchar_t* _Py_wgetcwd(wchar_t *buf, size_t size)