Skip to content

Commit

Permalink
Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()
Browse files Browse the repository at this point in the history
 * PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string
   from the current locale encoding
 * _Py_char2wchar() writes an "error code" in the size argument to indicate
   if the function failed because of memory allocation failure or because of a
   decoding error. The function doesn't write the error message directly to
   stderr.
 * Fix time.strftime() (if wcsftime() is missing): decode strftime() result
   from the current locale encoding, not from the filesystem encoding.
  • Loading branch information
Victor Stinner committed Dec 16, 2011
1 parent 3607e3d commit af02e1c
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 84 deletions.
40 changes: 40 additions & 0 deletions Doc/c-api/unicode.rst
Expand Up @@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
throughout the interpreter whenever coercion to Unicode is needed.
Locale Encoding
"""""""""""""""
The current locale encoding can be used to decode text from the operating
system.
.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the
``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
bytes. If a byte sequence can be decoded as a surrogate character and
*surrogateescape* is not equal to zero, the byte sequence is escaped using
the ``'surrogateescape'`` error handler instead of being decoded. *str*
must end with a null character but cannot contain embedded null characters.
.. seealso::
Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
.. versionadded:: 3.3
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
length using :c:func:`strlen`.
.. versionadded:: 3.3
File System Encoding
""""""""""""""""""""
Expand Down Expand Up @@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
.. seealso::
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
locale encoding and cannot be modified later. If you need to decode a
string from the current locale encoding, use
:c:func:`PyUnicode_DecodeLocaleAndSize`.
.. versionchanged:: 3.2
Use ``'strict'`` error handler on Windows.
Expand Down
22 changes: 22 additions & 0 deletions Include/unicodeobject.h
Expand Up @@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
);
#endif

/* --- Locale encoding --------------------------------------------------- */

/* Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
be decoded as a surrogate character and *surrogateescape* is not equal to
zero, the byte sequence is escaped using the 'surrogateescape' error handler
instead of being decoded. *str* must end with a null character but cannot
contain embedded null characters. */

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
const char *str,
Py_ssize_t len,
int surrogateescape);

/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
length using strlen(). */

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
const char *str,
int surrogateescape);

/* --- File system encoding ---------------------------------------------- */

/* ParseTuple converter: encode str objects to bytes using
Expand Down
57 changes: 10 additions & 47 deletions Modules/_localemodule.c
Expand Up @@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");

static PyObject *Error;

/* Decode the null-terminated byte string s with the current locale encoding
   and return the result as a new Unicode object.

   Returns NULL with ValueError set if the length query reports a conversion
   failure, or NULL with MemoryError set if the temporary wide-character
   buffer cannot be allocated. */
static PyObject*
str2uni(const char* s)
{
    wchar_t stackbuf[30];
    wchar_t *wbuf = stackbuf;
    size_t nchars;
    size_t converted;
    PyObject *unicode;

    /* Work out how many wide characters the conversion needs. When
       HAVE_BROKEN_MBSTOWCS is defined the byte length of s is used as the
       estimate instead of asking mbstowcs(). */
#ifdef HAVE_BROKEN_MBSTOWCS
    nchars = strlen(s);
#else
    nchars = mbstowcs(NULL, s, 0);
#endif
    if (nchars == (size_t)-1) {
        PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
        return NULL;
    }

    /* Short results are converted in the on-stack buffer; anything else gets
       a heap buffer with room for the terminating null wide character. */
    if (nchars*sizeof(wchar_t) >= sizeof(stackbuf)) {
        wbuf = PyMem_Malloc((nchars+1)*sizeof(wchar_t));
        if (wbuf == NULL)
            return PyErr_NoMemory();
    }

    /* This shouldn't fail now */
    converted = mbstowcs(wbuf, s, nchars+1);
#ifdef HAVE_BROKEN_MBSTOWCS
    assert(converted != (size_t)-1);
#else
    assert(converted == nchars);
#endif

    unicode = PyUnicode_FromWideChar(wbuf, converted);
    if (wbuf != stackbuf)
        PyMem_Free(wbuf);
    return unicode;
}

/* support functions for formatting floating point numbers */

PyDoc_STRVAR(setlocale__doc__,
Expand Down Expand Up @@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "unsupported locale setting");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
if (!result_object)
return NULL;
} else {
Expand All @@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "locale query failed");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
}
return result_object;
}
Expand All @@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
involved herein */

#define RESULT_STRING(s)\
x = str2uni(l->s); \
x = PyUnicode_DecodeLocale(l->s, 0); \
if (!x) goto failed;\
PyDict_SetItemString(result, #s, x);\
Py_XDECREF(x)
Expand Down Expand Up @@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
instead of an empty string for nl_langinfo(ERA). */
const char *result = nl_langinfo(item);
result = result != NULL ? result : "";
return str2uni(result);
return PyUnicode_DecodeLocale(result, 0);
}
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
return NULL;
Expand All @@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
char *in;
if (!PyArg_ParseTuple(args, "s", &in))
return 0;
return str2uni(gettext(in));
return PyUnicode_DecodeLocale(gettext(in), 0);
}

PyDoc_STRVAR(dgettext__doc__,
Expand All @@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
char *domain, *in;
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
return 0;
return str2uni(dgettext(domain, in));
return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
}

PyDoc_STRVAR(dcgettext__doc__,
Expand All @@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
int category;
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
return 0;
return str2uni(dcgettext(domain,msgid,category));
return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
}

PyDoc_STRVAR(textdomain__doc__,
Expand All @@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
return str2uni(domain);
return PyUnicode_DecodeLocale(domain, 0);
}

PyDoc_STRVAR(bindtextdomain__doc__,
Expand Down Expand Up @@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
result = str2uni(current_dirname);
result = PyUnicode_DecodeLocale(current_dirname, 0);
Py_XDECREF(dirname_bytes);
return result;
}
Expand All @@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
return NULL;
codeset = bind_textdomain_codeset(domain, codeset);
if (codeset)
return str2uni(codeset);
return PyUnicode_DecodeLocale(codeset, 0);
Py_RETURN_NONE;
}
#endif
Expand Down
13 changes: 5 additions & 8 deletions Modules/main.c
Expand Up @@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
/* Use utf-8 on Mac OS X */
unicode = PyUnicode_FromString(p);
#else
wchar_t *wchar;
size_t len;
wchar = _Py_char2wchar(p, &len);
if (wchar == NULL)
continue;
unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
unicode = PyUnicode_DecodeLocale(p, 1);
#endif
if (unicode == NULL)
if (unicode == NULL) {
/* ignore errors */
PyErr_Clear();
continue;
}
PySys_AddWarnOptionUnicode(unicode);
Py_DECREF(unicode);
}
Expand Down
6 changes: 3 additions & 3 deletions Modules/timemodule.c
Expand Up @@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen);
#else
ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
#endif
PyMem_Free(outbuf);
break;
Expand Down Expand Up @@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
#endif /* PYOS_OS2 */
#endif
PyModule_AddIntConstant(m, "daylight", daylight);
otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
#ifdef HAVE_STRUCT_TM_TM_ZONE
Expand Down
95 changes: 78 additions & 17 deletions Objects/unicodeobject.c
Expand Up @@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
return NULL;
}

/* Decode the byte string str, of length len, from the current locale
   encoding and return a new Unicode object.

   str must be null-terminated at str[len] and must not contain embedded null
   characters; otherwise TypeError is set and NULL is returned.

   If surrogateescape is non-zero, decoding is delegated to _Py_char2wchar().
   Otherwise the decoder is strict and uses mbstowcs() directly.

   On failure NULL is returned with an exception set: MemoryError if a buffer
   cannot be allocated, OSError (built from errno) on a decoding error. */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              int surrogateescape)
{
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;

    if (str[len] != '\0' || (size_t)len != strlen(str)) {
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape)
    {
        wstr = _Py_char2wchar(str, &wlen);
        if (wstr == NULL) {
            /* _Py_char2wchar() reports the failure kind through wlen:
               (size_t)-1 is treated as a memory allocation failure, any
               other value as a decoding error. */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_Free(wstr);
    }
    else {
#ifndef HAVE_BROKEN_MBSTOWCS
        wlen = mbstowcs(NULL, str, 0);
#else
        /* The byte length is an upper bound on the number of wide
           characters: every wide character consumes at least one byte. */
        wlen = len;
#endif
        if (wlen == (size_t)-1) {
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
        if (wlen+1 <= smallbuf_len) {
            wstr = smallbuf;
        }
        else {
            /* Guard the (wlen+1) * sizeof(wchar_t) computation below
               against overflow. */
            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
                return PyErr_NoMemory();

            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
            if (!wstr)
                return PyErr_NoMemory();
        }

        /* This shouldn't fail now */
        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
#ifndef HAVE_BROKEN_MBSTOWCS
        /* wlen came from mbstowcs(NULL, ...), so the second pass is
           expected to produce exactly that many wide characters. */
        assert(wlen2 == wlen);
#else
        /* wlen is only the byte length here; multibyte sequences make the
           decoded length shorter, so only the upper bound can be checked. */
        assert(wlen2 <= wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;
}

PyObject*
PyUnicode_DecodeLocale(const char *str, int surrogateescape)
{
Py_ssize_t size = (Py_ssize_t)strlen(str);
return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
}


PyObject*
PyUnicode_DecodeFSDefault(const char *s) {
Py_ssize_t size = (Py_ssize_t)strlen(s);
Expand Down Expand Up @@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
"surrogateescape");
}
else {
/* locale encoding with surrogateescape */
wchar_t *wchar;
PyObject *unicode;
size_t len;

if (s[size] != '\0' || size != strlen(s)) {
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
return NULL;
}

wchar = _Py_char2wchar(s, &len);
if (wchar == NULL)
return PyErr_NoMemory();

unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
return unicode;
return PyUnicode_DecodeLocaleAndSize(s, size, 1);
}
#endif
}
Expand Down

0 comments on commit af02e1c

Please sign in to comment.