From fb7b944f99c397175b7cd7064263707d825a6171 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Mon, 22 Sep 2025 15:02:45 +0200
Subject: [PATCH 1/5] gh-63161: Fix tokenize detect_encoding() for non-ASCII
 coding

---
 Lib/test/test_tokenize.py |  8 ++++++++
 Lib/tokenize.py           | 24 ++++++++++++++----------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 865e0c5b40ddd3..c769c5b8e203b9 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1485,6 +1485,14 @@ def test_syntaxerror_latin1(self):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
+    def test_nonascii_coding(self):
+        # gh-63161: test non-ASCII coding
+        lines = (
+            '#coding=iso8859-15 €'.encode('iso8859-15'),
+        )
+        readline = self.get_readline(lines)
+        found, consumed_lines = tokenize.detect_encoding(readline)
+        self.assertEqual(found, "iso8859-15")
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7e71755068e1df..d8eeed9ce1d956 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -386,20 +386,24 @@ def read_or_stop():
             return b''
 
     def find_cookie(line):
-        try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
-        except UnicodeDecodeError:
-            msg = "invalid or missing encoding declaration"
-            if filename is not None:
-                msg = '{} for {!r}'.format(msg, filename)
-            raise SyntaxError(msg)
+        # gh-63161: Use surrogateescape error handler to escape potential
+        # non-ASCII characters after the coding declaration.
+        line_string = line.decode('utf-8', 'surrogateescape')
 
         match = cookie_re.match(line_string)
         if not match:
+            try:
+                # Decode as UTF-8. Either the line is an encoding declaration,
+                # in which case it should be pure ASCII, or it must be UTF-8
+                # per default encoding.
+                line.decode('utf-8')
+            except UnicodeDecodeError:
+                msg = "invalid or missing encoding declaration"
+                if filename is not None:
+                    msg = '{} for {!r}'.format(msg, filename)
+                raise SyntaxError(msg)
             return None
+
         encoding = _get_normal_name(match.group(1))
         try:
             codec = lookup(encoding)

From c535b65f92a717d79a784056439b7b62216562d9 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Mon, 22 Sep 2025 15:07:36 +0200
Subject: [PATCH 2/5] Add NEWS entry

---
 .../next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst

diff --git a/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst b/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst
new file mode 100644
index 00000000000000..e0709775b0dd05
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst
@@ -0,0 +1,2 @@
+Fix :func:`tokenize.detect_encoding` for non-ASCII coding. Patch by Victor
+Stinner.
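For reference, the behavior change in PATCH 1 can be exercised outside the
test suite. A minimal sketch follows, where io.BytesIO(...).readline stands
in for the test helper get_readline(); the expected result comes from the
test_nonascii_coding assertion above. Before this patch, the euro sign
(byte 0xA4 in ISO 8859-15, invalid UTF-8) after the cookie made the
unconditional UTF-8 decode fail and detect_encoding() raise SyntaxError:

    import io
    import tokenize

    # Coding cookie followed by a byte sequence that is not valid UTF-8:
    # '€' encodes to 0xA4 in ISO 8859-15.
    source = '#coding=iso8859-15 €\n'.encode('iso8859-15')
    readline = io.BytesIO(source).readline

    encoding, consumed_lines = tokenize.detect_encoding(readline)
    print(encoding)  # iso8859-15

Note that the UTF-8 validity check is deferred, not dropped: when no cookie
matches, the line is still required to decode as UTF-8, which the later
test patches exercise.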
From 5723fc57c5e514130307f26647ebb7fe747379f0 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Mon, 22 Sep 2025 16:17:42 +0200
Subject: [PATCH 3/5] Add tests

---
 Lib/test/test_tokenize.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index c769c5b8e203b9..7496ec27728df3 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1487,12 +1487,23 @@ def test_syntaxerror_latin1(self):
 
     def test_nonascii_coding(self):
         # gh-63161: test non-ASCII coding
-        lines = (
-            '#coding=iso8859-15 €'.encode('iso8859-15'),
-        )
-        readline = self.get_readline(lines)
-        found, consumed_lines = tokenize.detect_encoding(readline)
-        self.assertEqual(found, "iso8859-15")
+        tests = [
+            ['iso8859-15',
+             ['#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['iso8859-15',
+             [b"#!/usr/bin/python\n",
+              '#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['ascii',
+             [b"#!/usr/bin/python\n",
+              '#coding=ascii €'.encode('utf8')]],
+            ['ascii',
+             ['#coding=ascii €'.encode('utf8')]],
+        ]
+        for encoding, lines in tests:
+            with self.subTest(encoding=encoding, lines=ascii(lines)):
+                readline = self.get_readline(lines)
+                found, consumed_lines = tokenize.detect_encoding(readline)
+                self.assertEqual(found, encoding)
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.

From 911dc3a4de61a6c05ce2f004519304d1a397e06f Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Tue, 23 Sep 2025 16:23:05 +0200
Subject: [PATCH 4/5] Add more tests

---
 Lib/test/test_tokenize.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 7496ec27728df3..d1a5dcc7551afe 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1491,10 +1491,10 @@ def test_nonascii_coding(self):
             ['iso8859-15',
              ['#coding=iso8859-15 €'.encode('iso8859-15')]],
             ['iso8859-15',
-             [b"#!/usr/bin/python\n",
+             [b"#!/usr/bin/python",
               '#coding=iso8859-15 €'.encode('iso8859-15')]],
             ['ascii',
-             [b"#!/usr/bin/python\n",
+             ["# nonascii €".encode('utf8'),
               '#coding=ascii €'.encode('utf8')]],
             ['ascii',
              ['#coding=ascii €'.encode('utf8')]],
@@ -1505,6 +1505,12 @@ def test_nonascii_coding(self):
                 found, consumed_lines = tokenize.detect_encoding(readline)
                 self.assertEqual(found, encoding)
 
+        lines = ["# nonascii €".encode('iso8859-15'),
+                 '#coding=iso8859-15 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
+
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
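The rejection path that PATCH 4 pins down can likewise be reproduced
standalone. A minimal sketch, again using an io.BytesIO readline in place
of get_readline(); the expected SyntaxError matches the assertRaises added
above:

    import io
    import tokenize

    # The first line carries no coding cookie and is not valid UTF-8, so
    # the deferred UTF-8 check still rejects the file; the cookie on the
    # second line is never consulted.
    lines = ["# nonascii €".encode('iso8859-15'),
             '#coding=iso8859-15 €'.encode('iso8859-15')]
    readline = io.BytesIO(b'\n'.join(lines) + b'\n').readline
    try:
        tokenize.detect_encoding(readline)
    except SyntaxError as exc:
        print(exc)  # invalid or missing encoding declaration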
From e36d860f7717f68ddbfa72a423a54a8a0e2cc820 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Tue, 23 Sep 2025 16:28:08 +0200
Subject: [PATCH 5/5] Test comments with no coding marker

---
 Lib/test/test_tokenize.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index d1a5dcc7551afe..12bc83e914e688 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1511,6 +1511,20 @@ def test_nonascii_coding(self):
         with self.assertRaises(SyntaxError):
             tokenize.detect_encoding(readline)
 
+    def test_nonascii(self):
+        # gh-63161: test non-ASCII header with no coding marker
+        lines = ["# nonascii line 1 €".encode('utf8'),
+                 '# nonascii line 2 €'.encode('utf8')]
+        readline = self.get_readline(lines)
+        found, consumed_lines = tokenize.detect_encoding(readline)
+        self.assertEqual(found, "utf-8")
+
+        lines = ["# nonascii line 1 €".encode('iso8859-15'),
+                 '# nonascii line 2 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
+
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
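Finally, PATCH 5 covers the default when no coding marker is present at
all. A minimal sketch of that case (io.BytesIO readline as before; the
"utf-8" result mirrors the test_nonascii assertion):

    import io
    import tokenize

    # No coding cookie: valid UTF-8 comments fall through to the UTF-8
    # default, even when they contain non-ASCII characters.
    source = '# nonascii line 1 €\n# nonascii line 2 €\n'.encode('utf8')
    readline = io.BytesIO(source).readline

    encoding, consumed_lines = tokenize.detect_encoding(readline)
    print(encoding)  # utf-8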