diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 5fa37963e07eff..f4317e7e2a888c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -971,6 +971,12 @@ These are the UTF-8 codec APIs: returned buffer always has an extra null byte appended (not included in *size*), regardless of whether there are any other null code points. + If *size* is NULL and the *unicode* string contains embedded null + characters, raise an exception. To accept embedded null characters and + truncate on purpose at the first null byte, :c:func:`PyUnicode_AsUTF8Unsafe` + and :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size) + <PyUnicode_AsUTF8AndSize>` can be used instead. + On error, set an exception, set *size* to ``-1`` (if it's not NULL) and return ``NULL``. @@ -987,15 +993,21 @@ These are the UTF-8 codec APIs: .. versionchanged:: 3.10 This function is a part of the :ref:`limited API <limited-c-api>`. + .. versionchanged:: 3.13 + Raise an exception if *size* is NULL and the string contains embedded + null characters. + .. c:function:: const char* PyUnicode_AsUTF8(PyObject *unicode) - As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size. + Similar to :c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL) + <PyUnicode_AsUTF8AndSize>`, but does not store the size. Raise an exception if the *unicode* string contains embedded null - characters. To accept embedded null characters and truncate on purpose - at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be - used instead. + characters. To accept embedded null characters and truncate on purpose at + the first null byte, :c:func:`PyUnicode_AsUTF8Unsafe` and + :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size) <PyUnicode_AsUTF8AndSize>` + can be used instead. .. versionadded:: 3.3 @@ -1005,6 +1017,16 @@ These are the UTF-8 codec APIs: .. versionchanged:: 3.13 Raise an exception if the string contains embedded null characters. +.. 
c:function:: const char* PyUnicode_AsUTF8Unsafe(PyObject *unicode) + + Similar to :c:func:`PyUnicode_AsUTF8`, but do not raise an exception if the + string contains embedded null characters. + + This function can be used to truncate a string on purpose at the first null + character. + + .. versionadded:: 3.13 + UTF-32 Codecs """"""""""""" diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 52d6d967d66327..003f864bd311a1 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -729,6 +729,7 @@ function,PyUnicode_AsUTF32String,3.2,, function,PyUnicode_AsUTF8,3.13,, function,PyUnicode_AsUTF8AndSize,3.10,, function,PyUnicode_AsUTF8String,3.2,, +function,PyUnicode_AsUTF8Unsafe,3.13,, function,PyUnicode_AsUnicodeEscapeString,3.2,, function,PyUnicode_AsWideChar,3.2,, function,PyUnicode_AsWideCharString,3.7,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 5ada880532aad2..c5b676ef32d67a 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1128,6 +1128,11 @@ New Features * Add :c:func:`PyUnicode_AsUTF8` function to the limited C API. (Contributed by Victor Stinner in :gh:`111089`.) +* Add :c:func:`PyUnicode_AsUTF8Unsafe` function: similar to + :c:func:`PyUnicode_AsUTF8`, but do not raise an exception if the string + contains embedded null characters. + (Contributed by Victor Stinner in :gh:`111089`.) + Porting to Python 3.13 ---------------------- @@ -1198,10 +1203,13 @@ Porting to Python 3.13 Note that ``Py_TRASHCAN_BEGIN`` has a second argument which should be the deallocation function it is in. -* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string - contains embedded null characters. To accept embedded null characters and - truncate on purpose at the first null byte, - ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead. 
+* The :c:func:`PyUnicode_AsUTF8` and + :c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL) <PyUnicode_AsUTF8AndSize>` + functions now raise an exception if the string contains embedded null + characters. To accept embedded null characters and truncate on purpose at the + first null byte, :c:func:`PyUnicode_AsUTF8Unsafe` and + :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size) <PyUnicode_AsUTF8AndSize>` + can be used instead. (Contributed by Victor Stinner in :gh:`111089`.) * On Windows, ``Python.h`` no longer includes the ``<stddef.h>`` standard diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index ee7b769ce5a6fc..d40f4093bcff91 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -451,7 +451,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( // This function caches the UTF-8 encoded string in the Unicode object // and subsequent calls will return the same string. The memory is released // when the Unicode object is deallocated. -PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); +PyAPI_FUNC(const char*) PyUnicode_AsUTF8(PyObject *unicode); + +// Similar to PyUnicode_AsUTF8(), but do not raise an exception if the string +// contains embedded null characters. 
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +PyAPI_FUNC(const char*) PyUnicode_AsUTF8Unsafe(PyObject *unicode); +#endif // Returns a pointer to the UTF-8 encoding of the // Unicode object unicode and the size of the encoded representation diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 63e51eb3ba3fe4..6bfdb42fbf198b 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -875,24 +875,37 @@ def test_fromordinal(self): self.assertRaises(ValueError, fromordinal, 0x110000) self.assertRaises(ValueError, fromordinal, -1) + def check_asutf8(self, unicode_asutf8): + self.assertEqual(unicode_asutf8('abc', 4), b'abc\0') + self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0') + self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0') + + self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0) + self.assertRaises(TypeError, unicode_asutf8, b'abc', 0) + self.assertRaises(TypeError, unicode_asutf8, [], 0) + # CRASHES unicode_asutf8(NULL, 0) + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_asutf8(self): """Test PyUnicode_AsUTF8()""" from _testcapi import unicode_asutf8 - - self.assertEqual(unicode_asutf8('abc', 4), b'abc\0') - self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0') - self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0') + self.check_asutf8(unicode_asutf8) # disallow embedded null characters self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0) self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0) - self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0) - self.assertRaises(TypeError, unicode_asutf8, b'abc', 0) - self.assertRaises(TypeError, unicode_asutf8, [], 0) - # CRASHES unicode_asutf8(NULL, 0) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_asutf8unsafe(self): + 
"""Test PyUnicode_AsUTF8Unsafe()""" + from _testcapi import unicode_asutf8unsafe + self.check_asutf8(unicode_asutf8unsafe) + + # allow embedded null characters + self.assertEqual(unicode_asutf8unsafe('abc\0', 4), b'abc\0') + self.assertEqual(unicode_asutf8unsafe('abc\0def', 8), b'abc\0def\0') @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -904,9 +917,11 @@ def test_asutf8andsize(self): self.assertEqual(unicode_asutf8andsize('abc', 4), (b'abc\0', 3)) self.assertEqual(unicode_asutf8andsize('абв', 7), (b'\xd0\xb0\xd0\xb1\xd0\xb2\0', 6)) self.assertEqual(unicode_asutf8andsize('\U0001f600', 5), (b'\xf0\x9f\x98\x80\0', 4)) + self.assertEqual(unicode_asutf8andsize('abc\0def', 8), (b'abc\0def\0', 7)) self.assertEqual(unicode_asutf8andsize_null('abc', 4), b'abc\0') - self.assertEqual(unicode_asutf8andsize_null('abc\0def', 8), b'abc\0def\0') + # PyUnicode_AsUTF8AndSize(str, NULL) disallows embedded null characters + self.assertRaises(ValueError, unicode_asutf8andsize_null, 'abc\0def', 8) self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, '\ud8ff', 0) self.assertRaises(TypeError, unicode_asutf8andsize, b'abc', 0) diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 88bc0fd4025a17..fe9f0881b8a525 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -748,6 +748,7 @@ def test_windows_feature_macros(self): "PyUnicode_AsUTF8", "PyUnicode_AsUTF8AndSize", "PyUnicode_AsUTF8String", + "PyUnicode_AsUTF8Unsafe", "PyUnicode_AsUnicodeEscapeString", "PyUnicode_AsWideChar", "PyUnicode_AsWideCharString", diff --git a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst index 2008dd5438d2b5..a8a47bdf25009e 100644 --- a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst +++ b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst @@ -1,2 +1,4 @@ 
-The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the -string contains embedded null characters. Patch by Victor Stinner. +The :c:func:`PyUnicode_AsUTF8` and +:c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL) <PyUnicode_AsUTF8AndSize>` +functions now raise an exception if the string contains embedded null +characters. Patch by Victor Stinner. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 0601de20fe0f46..6319d61fcdee7e 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2480,3 +2480,5 @@ added = '3.13' [function.PyUnicode_AsUTF8] added = '3.13' +[function.PyUnicode_AsUTF8Unsafe] + added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index a10183dddeca98..734e954a522b25 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -619,6 +619,25 @@ unicode_asutf8(PyObject *self, PyObject *args) return PyBytes_FromStringAndSize(s, buflen); } +/* Test PyUnicode_AsUTF8Unsafe() */ +static PyObject * +unicode_asutf8unsafe(PyObject *self, PyObject *args) +{ + PyObject *unicode; + Py_ssize_t buflen; + const char *s; + + if (!PyArg_ParseTuple(args, "On", &unicode, &buflen)) + return NULL; + + NULLABLE(unicode); + s = PyUnicode_AsUTF8Unsafe(unicode); + if (s == NULL) + return NULL; + + return PyBytes_FromStringAndSize(s, buflen); +} + /* Test PyUnicode_AsUTF8AndSize() */ static PyObject * unicode_asutf8andsize(PyObject *self, PyObject *args) @@ -2031,6 +2050,7 @@ static PyMethodDef TestMethods[] = { {"unicode_asucs4copy", unicode_asucs4copy, METH_VARARGS}, {"unicode_fromordinal", unicode_fromordinal, METH_VARARGS}, {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, + {"unicode_asutf8unsafe", unicode_asutf8unsafe, METH_VARARGS}, {"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS}, {"unicode_asutf8andsize_null",unicode_asutf8andsize_null, METH_VARARGS}, {"unicode_getdefaultencoding",unicode_getdefaultencoding, METH_NOARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 
87636efcfca050..5404412388b830 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -205,6 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); #endif +static int unicode_fill_utf8(PyObject *unicode); // Return a reference to the immortal empty string singleton. @@ -3813,10 +3814,8 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) } -static int unicode_fill_utf8(PyObject *unicode); - -const char * -PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) +static const char * +unicode_as_utf8(PyObject *unicode, Py_ssize_t *psize, int check_embed_null) { if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); @@ -3826,31 +3825,47 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) return NULL; } - if (PyUnicode_UTF8(unicode) == NULL) { + const char *utf8 = PyUnicode_UTF8(unicode); + if (utf8 == NULL) { if (unicode_fill_utf8(unicode) == -1) { if (psize) { *psize = -1; } return NULL; } + utf8 = PyUnicode_UTF8(unicode); } if (psize) { *psize = PyUnicode_UTF8_LENGTH(unicode); } + + if (check_embed_null) { + if (strlen(utf8) != (size_t)PyUnicode_UTF8_LENGTH(unicode)) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + return NULL; + } + } + return PyUnicode_UTF8(unicode); } -const char * +const char* +PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) +{ + return unicode_as_utf8(unicode, psize, psize == NULL); +} + +const char* PyUnicode_AsUTF8(PyObject *unicode) { - Py_ssize_t size; - const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size); - if (utf8 != NULL && strlen(utf8) != (size_t)size) { - PyErr_SetString(PyExc_ValueError, "embedded null character"); - return NULL; - } - return utf8; + return unicode_as_utf8(unicode, NULL, 1); +} + +const char* +PyUnicode_AsUTF8Unsafe(PyObject *unicode) +{ + return unicode_as_utf8(unicode, NULL, 0); } /* diff --git a/PC/python3dll.c b/PC/python3dll.c index 
7f5d97ae4dc83f..7e3741f6e0893e 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -664,6 +664,7 @@ EXPORT_FUNC(PyUnicode_AsUTF32String) EXPORT_FUNC(PyUnicode_AsUTF8) EXPORT_FUNC(PyUnicode_AsUTF8AndSize) EXPORT_FUNC(PyUnicode_AsUTF8String) +EXPORT_FUNC(PyUnicode_AsUTF8Unsafe) EXPORT_FUNC(PyUnicode_AsWideChar) EXPORT_FUNC(PyUnicode_AsWideCharString) EXPORT_FUNC(PyUnicode_BuildEncodingMap)