Skip to content
Permalink
Browse files

Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()

 * PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string
   from the current locale encoding
 * _Py_char2wchar() writes an "error code" in the size argument to indicate
   if the function failed because of memory allocation failure or because of a
   decoding error. The function doesn't write the error message directly to
   stderr.
 * Fix time.strftime() (if wcsftime() is missing): decode strftime() result
   from the current locale encoding, not from the filesystem encoding.
  • Loading branch information...
Victor Stinner
Victor Stinner committed Dec 16, 2011
1 parent 3607e3d commit af02e1c85a66009cdc645a64de7d7ee1335c8301
Showing with 174 additions and 84 deletions.
  1. +40 −0 Doc/c-api/unicode.rst
  2. +22 −0 Include/unicodeobject.h
  3. +10 −47 Modules/_localemodule.c
  4. +5 −8 Modules/main.c
  5. +3 −3 Modules/timemodule.c
  6. +78 −17 Objects/unicodeobject.c
  7. +16 −9 Python/fileutils.c
@@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
throughout the interpreter whenever coercion to Unicode is needed.
Locale Encoding
"""""""""""""""
The current locale encoding can be used to decode text from the operating
system.
.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the
``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
bytes. If a byte sequence can be decoded as a surrogate character and
*surrogateescape* is not equal to zero, the byte sequence is escaped using
the ``'surrogateescape'`` error handler instead of being decoded. *str*
must end with a null character but cannot contain embedded null characters.
.. seealso::
Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
.. versionadded:: 3.3
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
length using :c:func:`strlen`.
.. versionadded:: 3.3
File System Encoding
""""""""""""""""""""
@@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
.. seealso::
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
locale encoding and cannot be modified later. If you need to decode a
string from the current locale encoding, use
:c:func:`PyUnicode_DecodeLocaleAndSize`.
.. versionchanged:: 3.2
Use ``'strict'`` error handler on Windows.
@@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
);
#endif

/* --- Locale encoding --------------------------------------------------- */

/* Decode a string from the current locale encoding. The decoder is strict if
*surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
be decoded as a surrogate character and *surrogateescape* is not equal to
zero, the byte sequence is escaped using the 'surrogateescape' error handler
instead of being decoded.

*str* must end with a null character and must not contain embedded null
characters; *len* is the number of bytes in *str*, not counting the
terminating null character. */

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
const char *str,
Py_ssize_t len,
int surrogateescape);

/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
length using strlen(), so *str* must be a null-terminated string.
*surrogateescape* has the same meaning as for
PyUnicode_DecodeLocaleAndSize(). */

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
const char *str,
int surrogateescape);

/* --- File system encoding ---------------------------------------------- */

/* ParseTuple converter: encode str objects to bytes using
@@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");

static PyObject *Error;

/* Build a Unicode object from the null-terminated byte string s, converting
   it with mbstowcs(), i.e. according to the current locale.
   Returns NULL with an exception set when the string cannot be converted or
   when the temporary wide-character buffer cannot be allocated. */
static PyObject*
str2uni(const char* s)
{
    wchar_t stackbuf[30];
    wchar_t *wbuf = stackbuf;
    PyObject *result;
    size_t converted;
    size_t wcount;

#ifdef HAVE_BROKEN_MBSTOWCS
    /* The size query is not used in this configuration; the byte length of
       s serves as the buffer size estimate instead. */
    wcount = strlen(s);
#else
    wcount = mbstowcs(NULL, s, 0);
#endif
    if (wcount == (size_t)-1) {
        PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
        return NULL;
    }

    /* Only go to the heap when the result (plus its terminator) does not fit
       in the on-stack buffer. */
    if (wcount*sizeof(wchar_t) >= sizeof(stackbuf)) {
        wbuf = PyMem_Malloc((wcount+1)*sizeof(wchar_t));
        if (wbuf == NULL)
            return PyErr_NoMemory();
    }

    /* The conversion itself is not expected to fail at this point. */
    converted = mbstowcs(wbuf, s, wcount+1);
#ifdef HAVE_BROKEN_MBSTOWCS
    assert(converted != (size_t)-1);
#else
    assert(converted == wcount);
#endif

    result = PyUnicode_FromWideChar(wbuf, converted);
    if (wbuf != stackbuf)
        PyMem_Free(wbuf);
    return result;
}

/* support functions for formatting floating point numbers */

PyDoc_STRVAR(setlocale__doc__,
@@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "unsupported locale setting");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
if (!result_object)
return NULL;
} else {
@@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "locale query failed");
return NULL;
}
result_object = str2uni(result);
result_object = PyUnicode_DecodeLocale(result, 0);
}
return result_object;
}
@@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
involved herein */

#define RESULT_STRING(s)\
x = str2uni(l->s); \
x = PyUnicode_DecodeLocale(l->s, 0); \
if (!x) goto failed;\
PyDict_SetItemString(result, #s, x);\
Py_XDECREF(x)
@@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
instead of an empty string for nl_langinfo(ERA). */
const char *result = nl_langinfo(item);
result = result != NULL ? result : "";
return str2uni(result);
return PyUnicode_DecodeLocale(result, 0);
}
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
return NULL;
@@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
char *in;
if (!PyArg_ParseTuple(args, "s", &in))
return 0;
return str2uni(gettext(in));
return PyUnicode_DecodeLocale(gettext(in), 0);
}

PyDoc_STRVAR(dgettext__doc__,
@@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
char *domain, *in;
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
return 0;
return str2uni(dgettext(domain, in));
return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
}

PyDoc_STRVAR(dcgettext__doc__,
@@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
int category;
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
return 0;
return str2uni(dcgettext(domain,msgid,category));
return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
}

PyDoc_STRVAR(textdomain__doc__,
@@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
return str2uni(domain);
return PyUnicode_DecodeLocale(domain, 0);
}

PyDoc_STRVAR(bindtextdomain__doc__,
@@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
result = str2uni(current_dirname);
result = PyUnicode_DecodeLocale(current_dirname, 0);
Py_XDECREF(dirname_bytes);
return result;
}
@@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
return NULL;
codeset = bind_textdomain_codeset(domain, codeset);
if (codeset)
return str2uni(codeset);
return PyUnicode_DecodeLocale(codeset, 0);
Py_RETURN_NONE;
}
#endif
@@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
/* Use utf-8 on Mac OS X */
unicode = PyUnicode_FromString(p);
#else
wchar_t *wchar;
size_t len;
wchar = _Py_char2wchar(p, &len);
if (wchar == NULL)
continue;
unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
unicode = PyUnicode_DecodeLocale(p, 1);
#endif
if (unicode == NULL)
if (unicode == NULL) {
/* ignore errors */
PyErr_Clear();
continue;
}
PySys_AddWarnOptionUnicode(unicode);
Py_DECREF(unicode);
}
@@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen);
#else
ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
#endif
PyMem_Free(outbuf);
break;
@@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
#endif /* PYOS_OS2 */
#endif
PyModule_AddIntConstant(m, "daylight", daylight);
otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
#ifdef HAVE_STRUCT_TM_TM_ZONE
@@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
return NULL;
}

/* Decode a null-terminated byte string from the current locale encoding.

   str: bytes to decode; str[len] must be the terminating null character and
        no null character may appear before it.
   len: number of bytes in str, not counting the terminating null character.
   surrogateescape: zero selects the strict decoder based on mbstowcs();
        any other value decodes through _Py_char2wchar().

   Returns the object built by PyUnicode_FromWideChar(), or NULL with an
   exception set: TypeError for an embedded null character, the
   PyErr_NoMemory() error when an allocation fails, and OSError (built from
   errno) when the bytes cannot be decoded. */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              int surrogateescape)
{
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;

    /* The cast makes the signed/unsigned comparison explicit. */
    if (str[len] != '\0' || (size_t)len != strlen(str)) {
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape)
    {
        wstr = _Py_char2wchar(str, &wlen);
        if (wstr == NULL) {
            /* On failure _Py_char2wchar() stores an error code in wlen:
               (size_t)-1 is treated as a memory allocation failure, any
               other value as a decoding error. */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_Free(wstr);
    }
    else {
#ifndef HAVE_BROKEN_MBSTOWCS
        /* Ask mbstowcs() how many wide characters the result needs. */
        wlen = mbstowcs(NULL, str, 0);
#else
        /* No size query in this configuration: use the byte length, which
           is only an upper bound on the number of wide characters. */
        wlen = len;
#endif
        if (wlen == (size_t)-1) {
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
        if (wlen+1 <= smallbuf_len) {
            /* Short strings are converted in the on-stack buffer. */
            wstr = smallbuf;
        }
        else {
            /* Refuse sizes whose byte count would overflow. */
            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
                return PyErr_NoMemory();

            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
            if (!wstr)
                return PyErr_NoMemory();
        }

        /* This shouldn't fail now */
        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }
#ifndef HAVE_BROKEN_MBSTOWCS
        /* The two mbstowcs() calls must agree on the length. This cannot be
           asserted when HAVE_BROKEN_MBSTOWCS is defined, because wlen is
           then the byte length and a multibyte string converts to fewer
           wide characters. */
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;
}

PyObject*
PyUnicode_DecodeLocale(const char *str, int surrogateescape)
{
Py_ssize_t size = (Py_ssize_t)strlen(str);
return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
}


PyObject*
PyUnicode_DecodeFSDefault(const char *s) {
Py_ssize_t size = (Py_ssize_t)strlen(s);
@@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
"surrogateescape");
}
else {
/* locale encoding with surrogateescape */
wchar_t *wchar;
PyObject *unicode;
size_t len;

if (s[size] != '\0' || size != strlen(s)) {
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
return NULL;
}

wchar = _Py_char2wchar(s, &len);
if (wchar == NULL)
return PyErr_NoMemory();

unicode = PyUnicode_FromWideChar(wchar, len);
PyMem_Free(wchar);
return unicode;
return PyUnicode_DecodeLocaleAndSize(s, size, 1);
}
#endif
}

0 comments on commit af02e1c

Please sign in to comment.
You can’t perform that action at this time.