From 92873d68937cb68077e01957611969fd26f0bad4 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:34:06 +0100 Subject: [PATCH 1/7] C --- Lib/encodings/__init__.py | 33 +++++++++------- Lib/test/test_codecs.py | 6 ++- ...5-07-14-09-33-17.gh-issue-55531.Gt2e12.rst | 4 ++ Modules/_codecsmodule.c | 39 +++++++++++++++++++ Modules/clinic/_codecsmodule.c.h | 31 ++++++++++++++- Objects/unicodeobject.c | 15 +++---- Python/codecs.c | 7 ++-- Python/fileutils.c | 4 +- 8 files changed, 110 insertions(+), 29 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..31ab4147668f26 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,7 +26,7 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs import sys @@ -37,10 +37,23 @@ _import_tail = ['*'] _aliases = aliases.aliases + +_norm_encoding_map = ( + #0123456789ABCDEF0123456789ABCDEF + ' ' + ' . 0123456789 ' + ' ABCDEFGHIJKLMNOPQRSTUVWXYZ ' + ' abcdefghijklmnopqrstuvwxyz ' + ' ' + ' ' + ' ' + ' ') + + class CodecRegistryError(LookupError, SystemError): pass -def normalize_encoding(encoding): +def normalize_encoding(encoding, /): """ Normalize an encoding name. @@ -55,18 +68,10 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") - chars = [] - punct = False - for c in encoding: - if c.isalnum() or c == '.': - if punct and chars: - chars.append('_') - if c.isascii(): - chars.append(c) - punct = False - else: - punct = True - return ''.join(chars) + s = encoding.translate(_norm_encoding_map) + return '_'.join(s.split()) + +from _codecs import _normalize_encoding as normalize_encoding def search_function(encoding): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d8666f7290e72e..99ea833b60bce6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3895,11 +3895,13 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') + + # encodings.normalize_encoding() does not accept non-ASCII characters. + self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8') + # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst new file mode 100644 index 00000000000000..70e39a4f2c167c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst @@ -0,0 +1,4 @@ +:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance +by implementing the function in C using the private +``_Py_normalize_encoding`` which has been modified to make lowercase +conversion optional. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 7cf3f152eeecc6..c8ce3738693bc0 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1022,6 +1022,44 @@ _codecs_lookup_error_impl(PyObject *module, const char *name) return PyCodec_LookupError(name); } +extern int _Py_normalize_encoding(const char *, char *, size_t, int); + +/*[clinic input] +_codecs._normalize_encoding + encoding: str(encoding='ascii') + / + +Normalize an encoding name, while not converting to lower case (to_lower == 1). +Used for encodings.normalize_encoding. +[clinic start generated code]*/ + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, char *encoding) +/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/ +{ + size_t len = strlen(encoding); + if (len > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "encoding is too large"); + return NULL; + } + + char *normalized = PyMem_Malloc(len + 1); + if (normalized == NULL) { + return PyErr_NoMemory(); + } + + if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) { + PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); + PyMem_Free(normalized); + return NULL; + } + + PyObject *v = PyUnicode_FromString(normalized); + PyMem_Free(normalized); + return v; +} + + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -1071,6 +1109,7 @@ static PyMethodDef _codecs_functions[] = { _CODECS_REGISTER_ERROR_METHODDEF _CODECS__UNREGISTER_ERROR_METHODDEF _CODECS_LOOKUP_ERROR_METHODDEF + _CODECS__NORMALIZE_ENCODING_METHODDEF {NULL, NULL} /* sentinel */ }; diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index b0310325759326..eb4f481f8118de 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2779,6 +2779,35 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) return return_value; } +PyDoc_STRVAR(_codecs__normalize_encoding__doc__, +"_normalize_encoding($module, encoding, /)\n" +"--\n" +"\n" +"Normalize an encoding name. Used for encodings.normalize_encoding."); + +#define _CODECS__NORMALIZE_ENCODING_METHODDEF \ + {"_normalize_encoding", (PyCFunction)_codecs__normalize_encoding, METH_O, _codecs__normalize_encoding__doc__}, + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, char *encoding); + +static PyObject * +_codecs__normalize_encoding(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + char *encoding = NULL; + + if (!PyArg_Parse(arg, "es:_normalize_encoding", "ascii", &encoding)) { + goto exit; + } + return_value = _codecs__normalize_encoding_impl(module, encoding); + /* Post parse cleanup for encoding */ + PyMem_FREE(encoding); + +exit: + return return_value; +} + #ifndef _CODECS_MBCS_DECODE_METHODDEF #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ @@ -2802,4 +2831,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=aa3636e281f5268f input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c2308a012142a..64d8cf4397237c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj, return v; } -/* Normalize an encoding name: similar to encodings.normalize_encoding(), but - also convert to lowercase. Return 1 on success, or 0 on error (encoding is - longer than lower_len-1). */ +/* Normalize an encoding name like encodings.normalize_encoding() + Optionally covert convert to lowercase by setting *to_lower* to 1. + Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, char *lower, - size_t lower_len) + size_t lower_len, + int to_lower) { const char *e; char *l; @@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding, if (l == l_end) { return 0; } - *l++ = Py_TOLOWER(c); + *l++ = to_lower ? Py_TOLOWER(c) : c; } else { punct = 1; @@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ @@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ diff --git a/Python/codecs.c b/Python/codecs.c index caf8d9d5f3c188..ffcb14928e0a82 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function) return 0; } -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are converted to lower case, spaces and hyphens are replaced with underscores. */ @@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string) } encoding = PyMem_Malloc(len + 1); - if (encoding == NULL) + if (encoding == NULL) { return PyErr_NoMemory(); + } - if (!_Py_normalize_encoding(string, encoding, len + 1)) + if (!_Py_normalize_encoding(string, encoding, len + 1, 1)) { PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(encoding); diff --git a/Python/fileutils.c b/Python/fileutils.c index 2a3f12d4e872f8..aedf8576c7a930 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) #define USE_FORCE_ASCII -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale and POSIX locale. nl_langinfo(CODESET) announces an alias of the @@ -231,7 +231,7 @@ check_force_ascii(void) } char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ - if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) { + if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) { goto error; } From 4bae23a7353f8ec04631dd647a5c51d56baf86c6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:43:10 +0100 Subject: [PATCH 2/7] Correct clinic note --- Modules/_codecsmodule.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index c8ce3738693bc0..d9441ee9b2a74e 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1029,8 +1029,10 @@ _codecs._normalize_encoding encoding: str(encoding='ascii') / -Normalize an encoding name, while not converting to lower case (to_lower == 1). +Normalize an encoding name *encoding*. + Used for encodings.normalize_encoding. +Does not convert to lower case (to_lower == 1). [clinic start generated code]*/ static PyObject * From b5f3df3a44dff71ea9795e12e510fd0de9eafdcb Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:54:58 +0100 Subject: [PATCH 3/7] Little fixes --- Lib/encodings/__init__.py | 2 +- Modules/_codecsmodule.c | 6 ++--- Modules/clinic/_codecsmodule.c.h | 42 +++++++++++++++++++++++++++----- Objects/unicodeobject.c | 2 +- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 31ab4147668f26..ef15189b984f40 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -53,7 +53,7 @@ class CodecRegistryError(LookupError, SystemError): pass -def normalize_encoding(encoding, /): +def normalize_encoding(encoding): """ Normalize an encoding name. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index d9441ee9b2a74e..36e58015e84286 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1027,17 +1027,15 @@ extern int _Py_normalize_encoding(const char *, char *, size_t, int); /*[clinic input] _codecs._normalize_encoding encoding: str(encoding='ascii') - / Normalize an encoding name *encoding*. -Used for encodings.normalize_encoding. -Does not convert to lower case (to_lower == 1). +Used for encodings.normalize_encoding. Does not convert to lower case. [clinic start generated code]*/ static PyObject * _codecs__normalize_encoding_impl(PyObject *module, char *encoding) -/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/ +/*[clinic end generated code: output=d5e3a4b5266fbe96 input=cdb53c013b2400e3]*/ { size_t len = strlen(encoding); if (len > PY_SSIZE_T_MAX) { diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index eb4f481f8118de..540c980216dcb6 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2780,24 +2780,54 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) } PyDoc_STRVAR(_codecs__normalize_encoding__doc__, -"_normalize_encoding($module, encoding, /)\n" +"_normalize_encoding($module, /, encoding)\n" "--\n" "\n" -"Normalize an encoding name. Used for encodings.normalize_encoding."); +"Normalize an encoding name *encoding*.\n" +"\n" +"Used for encodings.normalize_encoding. Does not convert to lower case."); #define _CODECS__NORMALIZE_ENCODING_METHODDEF \ - {"_normalize_encoding", (PyCFunction)_codecs__normalize_encoding, METH_O, _codecs__normalize_encoding__doc__}, + {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, static PyObject * _codecs__normalize_encoding_impl(PyObject *module, char *encoding); static PyObject * -_codecs__normalize_encoding(PyObject *module, PyObject *arg) +_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(encoding), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"encoding", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .format = "es:_normalize_encoding", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE char *encoding = NULL; - if (!PyArg_Parse(arg, "es:_normalize_encoding", "ascii", &encoding)) { + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + "ascii", &encoding)) { goto exit; } return_value = _codecs__normalize_encoding_impl(module, encoding); @@ -2831,4 +2861,4 @@ _codecs__normalize_encoding(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=aa3636e281f5268f input=a9049054013a1b77]*/ +/*[clinic end generated code: output=0859b218fa612efd input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 64d8cf4397237c..ba66e273a208be 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3588,7 +3588,7 @@ PyUnicode_FromEncodedObject(PyObject *obj, } /* Normalize an encoding name like encodings.normalize_encoding() - Optionally covert convert to lowercase by setting *to_lower* to 1. + but allow to convert to lowercase if *to_lower* is true. Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, From 2ad72b20dfd463f2f17bea6d4c9284eff6cd39f6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 10:17:22 +0100 Subject: [PATCH 4/7] Keep the messiness --- Lib/encodings/__init__.py | 18 ++---------------- Modules/_codecsmodule.c | 15 ++++++++++----- Modules/clinic/_codecsmodule.c.h | 21 +++++++++++++-------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index ef15189b984f40..523b43e2f69cde 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,6 +29,7 @@ """ import codecs +from _codecs import _normalize_encoding import sys from . import aliases @@ -38,18 +39,6 @@ _aliases = aliases.aliases -_norm_encoding_map = ( - #0123456789ABCDEF0123456789ABCDEF - ' ' - ' . 0123456789 ' - ' ABCDEFGHIJKLMNOPQRSTUVWXYZ ' - ' abcdefghijklmnopqrstuvwxyz ' - ' ' - ' ' - ' ' - ' ') - - class CodecRegistryError(LookupError, SystemError): pass @@ -68,10 +57,7 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") - s = encoding.translate(_norm_encoding_map) - return '_'.join(s.split()) - -from _codecs import _normalize_encoding as normalize_encoding + return _normalize_encoding(encoding) def search_function(encoding): diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 36e58015e84286..f2ba4eb79650ab 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1026,7 +1026,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t, int); /*[clinic input] _codecs._normalize_encoding - encoding: str(encoding='ascii') + encoding: unicode Normalize an encoding name *encoding*. @@ -1034,10 +1034,15 @@ Used for encodings.normalize_encoding. Does not convert to lower case. [clinic start generated code]*/ static PyObject * -_codecs__normalize_encoding_impl(PyObject *module, char *encoding) -/*[clinic end generated code: output=d5e3a4b5266fbe96 input=cdb53c013b2400e3]*/ +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) +/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/ { - size_t len = strlen(encoding); + const char *cstr = PyUnicode_AsUTF8(encoding); + if (cstr == NULL) { + return NULL; + } + + size_t len = strlen(cstr); if (len > PY_SSIZE_T_MAX) { PyErr_SetString(PyExc_OverflowError, "encoding is too large"); return NULL; @@ -1048,7 +1053,7 @@ _codecs__normalize_encoding_impl(PyObject *module, char *encoding) return PyErr_NoMemory(); } - if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) { + if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(normalized); return NULL; diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 540c980216dcb6..9e2a7950ebde64 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2791,7 +2791,7 @@ PyDoc_STRVAR(_codecs__normalize_encoding__doc__, {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, static PyObject * -_codecs__normalize_encoding_impl(PyObject *module, char *encoding); +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding); static PyObject * _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) @@ -2820,19 +2820,24 @@ _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t static const char * const _keywords[] = {"encoding", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, - .format = "es:_normalize_encoding", + .fname = "_normalize_encoding", .kwtuple = KWTUPLE, }; #undef KWTUPLE - char *encoding = NULL; + PyObject *argsbuf[1]; + PyObject *encoding; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, - "ascii", &encoding)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]); goto exit; } + encoding = args[0]; return_value = _codecs__normalize_encoding_impl(module, encoding); - /* Post parse cleanup for encoding */ - PyMem_FREE(encoding); exit: return return_value; @@ -2861,4 +2866,4 @@ _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=0859b218fa612efd input=a9049054013a1b77]*/ +/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/ From 3660160929bd17ad3afae8a00ad805dc1ff93ef6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 10:29:49 +0100 Subject: [PATCH 5/7] Clean up tests --- Lib/test/test_codecs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 99ea833b60bce6..348b450d1118d2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3895,13 +3895,12 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) def test_encodings_normalize_encoding(self): + # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') - # encodings.normalize_encoding() does not accept non-ASCII characters. - self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8') - # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') From 4e12b9ec888d1d33c9d956e10318847e3a50b58b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 13:53:48 +0100 Subject: [PATCH 6/7] Remove unnecessary message --- Modules/_codecsmodule.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index f2ba4eb79650ab..1d3534ab98fc47 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1054,7 +1054,6 @@ _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) } if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { - PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(normalized); return NULL; } @@ -1064,7 +1063,6 @@ _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) return v; } - /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { From 1c9e55ab8ffafd2bb0e68c688fadab90399cfc16 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Tue, 15 Jul 2025 18:02:12 +0100 Subject: [PATCH 7/7] Review --- Lib/encodings/__init__.py | 5 ++--- Lib/test/test_codecs.py | 1 - Modules/_codecsmodule.c | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 523b43e2f69cde..e7e4ca3358e0f9 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,11 +26,11 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -""" +"""#" import codecs -from _codecs import _normalize_encoding import sys +from _codecs import _normalize_encoding from . import aliases _cache = {} @@ -38,7 +38,6 @@ _import_tail = ['*'] _aliases = aliases.aliases - class CodecRegistryError(LookupError, SystemError): pass diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 348b450d1118d2..d8666f7290e72e 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3900,7 +3900,6 @@ def test_encodings_normalize_encoding(self): self.assertEqual(normalize('utf_8'), 'utf_8') self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') - # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 1d3534ab98fc47..853d461ef15950 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1037,30 +1037,41 @@ static PyObject * _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) /*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/ { - const char *cstr = PyUnicode_AsUTF8(encoding); + Py_ssize_t len; + const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len); if (cstr == NULL) { return NULL; } - size_t len = strlen(cstr); if (len > PY_SSIZE_T_MAX) { PyErr_SetString(PyExc_OverflowError, "encoding is too large"); return NULL; } + PyUnicodeWriter *writer = PyUnicodeWriter_Create(len + 1); + if (writer == NULL) { + return NULL; + } + char *normalized = PyMem_Malloc(len + 1); if (normalized == NULL) { + PyUnicodeWriter_Discard(writer); return PyErr_NoMemory(); } if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { PyMem_Free(normalized); + PyUnicodeWriter_Discard(writer); return NULL; } - PyObject *v = PyUnicode_FromString(normalized); + if (PyUnicodeWriter_WriteUTF8(writer, normalized, (Py_ssize_t)strlen(normalized)) < 0) { + PyUnicodeWriter_Discard(writer); + PyMem_Free(normalized); + return NULL; + } PyMem_Free(normalized); - return v; + return PyUnicodeWriter_Finish(writer); } /* --- Module API --------------------------------------------------------- */