From c8fc6585856b9be85e989665d956d6ada685983b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 13 Oct 2025 11:26:19 +0100 Subject: [PATCH 1/3] deprecate non-ascii --- Doc/deprecations/pending-removal-in-3.17.rst | 6 ++++++ Lib/encodings/__init__.py | 8 +++++++- Lib/test/test_codecs.py | 10 +++++++--- .../2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst | 3 +++ 4 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst diff --git a/Doc/deprecations/pending-removal-in-3.17.rst b/Doc/deprecations/pending-removal-in-3.17.rst index 0a1c2f08cab3bd..e769c9d371e133 100644 --- a/Doc/deprecations/pending-removal-in-3.17.rst +++ b/Doc/deprecations/pending-removal-in-3.17.rst @@ -23,6 +23,12 @@ Pending removal in Python 3.17 (Contributed by Shantanu Jain in :gh:`91896`.) +* :mod:`encodings`: + + - Passing non-ASCII *encoding* names to :func:`encodings.normalize_encoding` + is deprecated and scheduled for removal in Python 3.17. + (Contributed by Stan Ulbrych in :gh:`136702`.) + * :mod:`typing`: - Before Python 3.14, old-style unions were implemented using the private class diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..b048fdc0223b86 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,9 +26,10 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs +import warnings import sys from . 
import aliases @@ -55,6 +56,11 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") + if not encoding.isascii(): + warnings.warn( + "Support for non-ascii encoding names will be removed in 3.17", + DeprecationWarning, stacklevel=2) + chars = [] punct = False for c in encoding: diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c35a4508943506..f1f0ac5ad36fd2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3886,15 +3886,14 @@ def search_function(encoding): self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa---8', 2, 3, 4)) - self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4)) + with self.assertWarns(DeprecationWarning): + self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. @@ -3902,6 +3901,11 @@ def test_encodings_normalize_encoding(self): self.assertEqual(normalize('utf.8'), 'utf.8') self.assertEqual(normalize('utf...8'), 'utf...8') + # Non-ASCII *encoding* is deprecated. 
+ with self.assertWarnsRegex(DeprecationWarning, + "Support for non-ascii encoding names will be removed in 3.17"): + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst new file mode 100644 index 00000000000000..88303f017f58c4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst @@ -0,0 +1,3 @@ +:mod:`encodings`: Deprecate passing a non-ASCII *encoding* name to +:func:`encodings.normalize_encoding`; support is scheduled for removal in +Python 3.17. From 5b50daaddae581499840282c8ba8384d814925f0 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 13 Oct 2025 11:34:12 +0100 Subject: [PATCH 2/3] Relocate import --- Lib/encodings/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index b048fdc0223b86..4a30d786f55881 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,7 +29,6 @@ """ import codecs -import warnings import sys from . 
def _sanitize_charset_name(charset, fallback_charset):
    """Return *charset* with every non-ASCII character removed.

    A falsy *charset* (``None`` or ``''``) is returned unchanged so the
    caller can apply its own handling for a missing charset.  If no
    character survives the filtering, *fallback_charset* is returned
    instead.
    """
    if not charset:
        return charset
    # str.isascii() is only true for code points below U+0080, so it
    # already rejects the surrogate escapes U+DC80..U+DCFF; a separate
    # range check on ord(c) would never change the outcome.
    sanitized = ''.join(c for c in charset if c.isascii())
    return sanitized or fallback_charset
charset = fallback_charset + charset = _sanitize_charset_name(charset, fallback_charset) rawbytes = bytes(text, 'raw-unicode-escape') try: return str(rawbytes, charset, errors)