diff --git a/Doc/deprecations/pending-removal-in-3.17.rst b/Doc/deprecations/pending-removal-in-3.17.rst index 0a1c2f08cab3bd..e769c9d371e133 100644 --- a/Doc/deprecations/pending-removal-in-3.17.rst +++ b/Doc/deprecations/pending-removal-in-3.17.rst @@ -23,6 +23,12 @@ Pending removal in Python 3.17 (Contributed by Shantanu Jain in :gh:`91896`.) +* :mod:`encodings`: + + - Passing non-ASCII *encoding* names to :func:`encodings.normalize_encoding` + is deprecated and scheduled for removal in Python 3.17. + (Contributed by Stan Ulbrych in :gh:`136702`.) + * :mod:`typing`: - Before Python 3.14, old-style unions were implemented using the private class diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 91243378dc0441..aa81f3554ca74a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,6 +796,7 @@ def params(self): value = urllib.parse.unquote(value, encoding='latin-1') else: try: + charset = utils._sanitize_charset_name(charset, 'us-ascii') value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 3de1f0d24a15b0..67cc3a550b7d9d 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -446,6 +446,15 @@ def decode_params(params): new_params.append((name, '"%s"' % value)) return new_params +def _sanitize_charset_name(charset, fallback_charset): + if not charset: + return charset + sanitized = ''.join( + c for c in charset + if (ord(c) < 0xDC80 or ord(c) > 0xDCFF) and c.isascii() + ) + return sanitized if sanitized else fallback_charset + def collapse_rfc2231_value(value, errors='replace', fallback_charset='us-ascii'): if not isinstance(value, tuple) or len(value) != 3: @@ -458,6 +467,7 @@ def collapse_rfc2231_value(value, errors='replace', # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse # the value, so use the fallback_charset.
charset = fallback_charset + charset = _sanitize_charset_name(charset, fallback_charset) rawbytes = bytes(text, 'raw-unicode-escape') try: return str(rawbytes, charset, errors) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..4a30d786f55881 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,7 +26,7 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs import sys @@ -55,6 +55,12 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") + if not encoding.isascii(): + import warnings + warnings.warn( + "Support for non-ascii encoding names will be removed in 3.17", + DeprecationWarning, stacklevel=2) + chars = [] punct = False for c in encoding: diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c35a4508943506..f1f0ac5ad36fd2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3886,15 +3886,14 @@ def search_function(encoding): self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa---8', 2, 3, 4)) - self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4)) + with self.assertWarns(DeprecationWarning): + self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. 
@@ -3902,6 +3901,11 @@ def test_encodings_normalize_encoding(self): self.assertEqual(normalize('utf.8'), 'utf.8') self.assertEqual(normalize('utf...8'), 'utf...8') + # Non-ASCII *encoding* is deprecated. + with self.assertWarnsRegex(DeprecationWarning, + "Support for non-ascii encoding names will be removed in 3.17"): + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst new file mode 100644 index 00000000000000..88303f017f58c4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst @@ -0,0 +1,3 @@ +:mod:`encodings`: Deprecate passing a non-ASCII *encoding* name to +:func:`encodings.normalize_encoding`; support for such names is scheduled +for removal in Python 3.17.