39 changes: 39 additions & 0 deletions Lib/test/test_tokenize.py
@@ -1485,6 +1485,45 @@ def test_syntaxerror_latin1(self):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
+    def test_nonascii_coding(self):
+        # gh-63161: test non-ASCII coding
+        tests = [
+            ['iso8859-15',
+             ['#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['iso8859-15',
+             [b"#!/usr/bin/python",
+              '#coding=iso8859-15 €'.encode('iso8859-15')]],
+            ['ascii',
+             ["# nonascii €".encode('utf8'),
+              '#coding=ascii €'.encode('utf8')]],
+            ['ascii',
+             ['#coding=ascii €'.encode('utf8')]],
+        ]
+        for encoding, lines in tests:
+            with self.subTest(encoding=encoding, lines=ascii(lines)):
+                readline = self.get_readline(lines)
+                found, consumed_lines = tokenize.detect_encoding(readline)
+                self.assertEqual(found, encoding)
+
+        lines = ["# nonascii €".encode('iso8859-15'),
+                 '#coding=iso8859-15 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
+
+    def test_nonascii(self):
+        # gh-63161: test non-ASCII header with no coding marker
+        lines = ["# nonascii line 1 €".encode('utf8'),
+                 '# nonascii line 2 €'.encode('utf8')]
+        readline = self.get_readline(lines)
+        found, consumed_lines = tokenize.detect_encoding(readline)
+        self.assertEqual(found, "utf-8")
+
+        lines = ["# nonascii line 1 €".encode('iso8859-15'),
+                 '# nonascii line 2 €'.encode('iso8859-15')]
+        readline = self.get_readline(lines)
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(readline)
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
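For reference, a minimal sketch of the behaviour these tests pin down, run against an interpreter that includes this fix; it uses io.BytesIO in place of the suite's get_readline() helper:

import io
import tokenize

# A coding cookie followed by non-ASCII text on the same line: with this
# fix, detect_encoding() reads the declared encoding instead of rejecting
# the line because it is not valid UTF-8.
source = '#coding=iso8859-15 €\nprint("hi")\n'.encode('iso8859-15')
encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
print(encoding)   # iso8859-15
print(consumed)   # the consumed lines, as a list of raw bytes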
24 changes: 14 additions & 10 deletions Lib/tokenize.py
@@ -386,20 +386,24 @@ def read_or_stop():
             return b''
 
     def find_cookie(line):
-        try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
-        except UnicodeDecodeError:
-            msg = "invalid or missing encoding declaration"
-            if filename is not None:
-                msg = '{} for {!r}'.format(msg, filename)
-            raise SyntaxError(msg)
+        # gh-63161: Use surrogateescape error handler to escape potential
+        # non-ASCII characters after the coding declaration.
+        line_string = line.decode('utf-8', 'surrogateescape')
 
         match = cookie_re.match(line_string)
         if not match:
+            try:
+                # Decode as UTF-8. Either the line is an encoding declaration,
+                # in which case it should be pure ASCII, or it must be UTF-8
+                # per default encoding.
+                line.decode('utf-8')
+            except UnicodeDecodeError:
+                msg = "invalid or missing encoding declaration"
+                if filename is not None:
+                    msg = '{} for {!r}'.format(msg, filename)
+                raise SyntaxError(msg)
             return None
 
         encoding = _get_normal_name(match.group(1))
         try:
             codec = lookup(encoding)
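The crux of the change above: decoding with the surrogateescape error handler turns undecodable bytes into lone surrogates instead of raising, so cookie_re can still scan the line, and the strict UTF-8 check now runs only when no cookie is found. A standalone sketch of that mechanism (the variable names here are ours, not the patch's):

raw = b'#coding=iso8859-15 \xa4'   # 0xa4 encodes '€' in iso8859-15

# Old behaviour: a strict UTF-8 decode fails outright, so the coding
# cookie on this line was never inspected.
try:
    raw.decode('utf-8')
except UnicodeDecodeError as exc:
    print('strict decode failed:', exc.reason)   # invalid start byte

# New behaviour: the undecodable byte becomes the lone surrogate
# '\udca4'; the ASCII-only cookie still matches, and the original bytes
# round-trip losslessly when re-encoded with the same handler.
text = raw.decode('utf-8', 'surrogateescape')
print(text.endswith('\udca4'))                          # True
print(text.encode('utf-8', 'surrogateescape') == raw)   # True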
@@ -0,0 +1,2 @@
+Fix :func:`tokenize.detect_encoding` for non-ASCII coding. Patch by Victor
+Stinner.