diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 865e0c5b40ddd3..12bc83e914e688 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1485,6 +1485,45 @@ def test_syntaxerror_latin1(self):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
+    def test_nonascii_coding(self):
+        # gh-63161: test non-ASCII coding
+        tests = [
+            ['iso8859-15',
+             ['#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['iso8859-15',
+             [b"#!/usr/bin/python",
+              '#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['ascii',
+             ["# nonascii €".encode('utf8'),
+              '#coding=ascii €'.encode('utf8')]],
+            ['ascii',
+             ['#coding=ascii €'.encode('utf8')]],
+        ]
+        for encoding, lines in tests:
+            with self.subTest(encoding=encoding, lines=ascii(lines)):
+                readline = self.get_readline(lines)
+                found, consumed_lines = tokenize.detect_encoding(readline)
+                self.assertEqual(found, encoding)
+
+        lines = ["# nonascii €".encode('iso8859-15'),
+                 '#coding=iso8859-15 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
+
+    def test_nonascii(self):
+        # gh-63161: test non-ASCII header with no coding marker
+        lines = ["# nonascii line 1 €".encode('utf8'),
+                 '# nonascii line 2 €'.encode('utf8')]
+        readline = self.get_readline(lines)
+        found, consumed_lines = tokenize.detect_encoding(readline)
+        self.assertEqual(found, "utf-8")
+
+        lines = ["# nonascii line 1 €".encode('iso8859-15'),
+                 '# nonascii line 2 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7e71755068e1df..d8eeed9ce1d956 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -386,20 +386,24 @@ def read_or_stop():
             return b''
 
     def find_cookie(line):
-        try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
-        except UnicodeDecodeError:
-            msg = "invalid or missing encoding declaration"
-            if filename is not None:
-                msg = '{} for {!r}'.format(msg, filename)
-            raise SyntaxError(msg)
+        # gh-63161: Use surrogateescape error handler to escape potential
+        # non-ASCII characters after the coding declaration.
+        line_string = line.decode('utf-8', 'surrogateescape')
 
         match = cookie_re.match(line_string)
         if not match:
+            try:
+                # Decode as UTF-8. Either the line is an encoding declaration,
+                # in which case it should be pure ASCII, or it must be UTF-8
+                # per default encoding.
+                line.decode('utf-8')
+            except UnicodeDecodeError:
+                msg = "invalid or missing encoding declaration"
+                if filename is not None:
+                    msg = '{} for {!r}'.format(msg, filename)
+                raise SyntaxError(msg)
             return None
+
         encoding = _get_normal_name(match.group(1))
         try:
             codec = lookup(encoding)
diff --git a/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst b/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst
new file mode 100644
index 00000000000000..e0709775b0dd05
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-22-15-07-33.gh-issue-63161.1f6k5q.rst
@@ -0,0 +1,2 @@
+Fix :func:`tokenize.detect_encoding` for non-ASCII coding. Patch by Victor
+Stinner.
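
For reference, a minimal usage sketch mirroring the new test cases (not part of the patch): with this change, detect_encoding() accepts a coding cookie line that itself contains non-ASCII bytes in the declared encoding, rather than rejecting the file because the raw line is not valid UTF-8.

import io
import tokenize

# The coding line ends with a euro sign encoded in iso8859-15 (b'\xa4'),
# so the raw bytes are not valid UTF-8. Previously detect_encoding()
# raised SyntaxError here; with the surrogateescape decoding the cookie
# is still matched and the declared encoding is returned.
source = '#coding=iso8859-15 €\n'.encode('iso8859-15')
encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
print(encoding)  # -> iso8859-15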