39 changes: 39 additions & 0 deletions Lib/test/test_tokenize.py
@@ -1485,6 +1485,45 @@ def test_syntaxerror_latin1(self):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
+    def test_nonascii_coding(self):
+        # gh-63161: test non-ASCII coding
+        tests = [
+            ['iso8859-15',
+             ['#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['iso8859-15',
+             [b"#!/usr/bin/python",
+              '#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['ascii',
+             ["# nonascii €".encode('utf8'),
+              '#coding=ascii €'.encode('utf8')]],
+            ['ascii',
+             ['#coding=ascii €'.encode('utf8')]],
+        ]
+        for encoding, lines in tests:
+            with self.subTest(encoding=encoding, lines=ascii(lines)):
+                readline = self.get_readline(lines)
+                found, consumed_lines = tokenize.detect_encoding(readline)
+                self.assertEqual(found, encoding)
+
+        lines = ["# nonascii €".encode('iso8859-15'),
+                 '#coding=iso8859-15 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
+
+    def test_nonascii(self):
+        # gh-63161: test non-ASCII header with no coding marker
+        lines = ["# nonascii line 1 €".encode('utf8'),
+                 '# nonascii line 2 €'.encode('utf8')]
+        readline = self.get_readline(lines)
+        found, consumed_lines = tokenize.detect_encoding(readline)
+        self.assertEqual(found, "utf-8")
+
+        lines = ["# nonascii line 1 €".encode('iso8859-15'),
+                 '# nonascii line 2 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
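For reference, a minimal sketch of the behaviour these tests pin down, run against an interpreter that includes this fix; it uses io.BytesIO in place of the suite's get_readline() helper:

import io
import tokenize

# A coding cookie followed by non-ASCII text on the same line: with this
# fix, detect_encoding() reads the declared encoding instead of rejecting
# the line because it is not valid UTF-8.
source = '#coding=iso8859-15 €\nprint("hi")\n'.encode('iso8859-15')
encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
print(encoding)   # iso8859-15
print(consumed)   # the consumed lines, as a list of raw bytes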
24 changes: 14 additions & 10 deletions Lib/tokenize.py
@@ -386,20 +386,24 @@ def read_or_stop():
             return b''
 
     def find_cookie(line):
-        try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
-        except UnicodeDecodeError:
-            msg = "invalid or missing encoding declaration"
-            if filename is not None:
-                msg = '{} for {!r}'.format(msg, filename)
-            raise SyntaxError(msg)
+        # gh-63161: Use surrogateescape error handler to escape potential
+        # non-ASCII characters after the coding declaration.
+        line_string = line.decode('utf-8', 'surrogateescape')
 
         match = cookie_re.match(line_string)
         if not match:
+            try:
+                # Decode as UTF-8. Either the line is an encoding declaration,
+                # in which case it should be pure ASCII, or it must be UTF-8
+                # per default encoding.
+                line.decode('utf-8')
+            except UnicodeDecodeError:
+                msg = "invalid or missing encoding declaration"
+                if filename is not None:
+                    msg = '{} for {!r}'.format(msg, filename)
+                raise SyntaxError(msg)
             return None
 
         encoding = _get_normal_name(match.group(1))
         try:
             codec = lookup(encoding)
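The crux of the change above: decoding with the surrogateescape error handler turns undecodable bytes into lone surrogates instead of raising, so cookie_re can still scan the line, and the strict UTF-8 check now runs only when no cookie is found. A standalone sketch of that mechanism (the variable names here are ours, not the patch's):

raw = b'#coding=iso8859-15 \xa4'   # 0xa4 encodes '€' in iso8859-15

# Old behaviour: a strict UTF-8 decode fails outright, so the coding
# cookie on this line was never inspected.
try:
    raw.decode('utf-8')
except UnicodeDecodeError as exc:
    print('strict decode failed:', exc.reason)   # invalid start byte

# New behaviour: the undecodable byte becomes the lone surrogate
# '\udca4'; the ASCII-only cookie still matches, and the original bytes
# round-trip losslessly when re-encoded with the same handler.
text = raw.decode('utf-8', 'surrogateescape')
print(text.endswith('\udca4'))                          # True
print(text.encode('utf-8', 'surrogateescape') == raw)   # True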
@@ -0,0 +1,2 @@
+Fix :func:`tokenize.detect_encoding` for non-ASCII coding. Patch by Victor
+Stinner.