Skip to content

Commit ea5bfdd

Browse files
gh-63161: Add more tests for source encoding
1 parent: 408154d · commit: ea5bfdd

File tree

2 files changed: +178 -20 lines changed

2 files changed: +178 -20 lines changed

Lib/test/test_source_encoding.py

Lines changed: 96 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -172,6 +172,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
172172
os.unlink(TESTFN)
173173

174174

175+
BUFSIZ = 2**13
176+
175177
class AbstractSourceEncodingTest:
176178

177179
def test_default_coding(self):
@@ -184,14 +186,20 @@ def test_first_coding_line(self):
184186
self.check_script_output(src, br"'\xc3\u20ac'")
185187

186188
def test_second_coding_line(self):
187-
src = (b'#\n'
189+
src = (b'#!/usr/bin/python\n'
190+
b'#coding:iso8859-15\n'
191+
b'print(ascii("\xc3\xa4"))\n')
192+
self.check_script_output(src, br"'\xc3\u20ac'")
193+
194+
def test_second_coding_line_empty_first_line(self):
195+
src = (b'\n'
188196
b'#coding:iso8859-15\n'
189197
b'print(ascii("\xc3\xa4"))\n')
190198
self.check_script_output(src, br"'\xc3\u20ac'")
191199

192200
def test_third_coding_line(self):
193201
# Only first two lines are tested for a magic comment.
194-
src = (b'#\n'
202+
src = (b'#!/usr/bin/python\n'
195203
b'#\n'
196204
b'#coding:iso8859-15\n'
197205
b'print(ascii("\xc3\xa4"))\n')
@@ -209,13 +217,52 @@ def test_double_coding_same_line(self):
209217
b'print(ascii("\xc3\xa4"))\n')
210218
self.check_script_output(src, br"'\xc3\u20ac'")
211219

220+
def test_double_coding_utf8(self):
221+
src = (b'#coding:utf-8\n'
222+
b'#coding:latin1\n'
223+
b'print(ascii("\xc3\xa4"))\n')
224+
self.check_script_output(src, br"'\xe4'")
225+
226+
def test_long_first_coding_line(self):
227+
src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
228+
b'print(ascii("\xc3\xa4"))\n')
229+
self.check_script_output(src, br"'\xc3\u20ac'")
230+
231+
def test_long_second_coding_line(self):
232+
src = (b'#!/usr/bin/python\n'
233+
b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
234+
b'print(ascii("\xc3\xa4"))\n')
235+
self.check_script_output(src, br"'\xc3\u20ac'")
236+
237+
def test_long_coding_line(self):
238+
src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
239+
b'print(ascii("\xc3\xa4"))\n')
240+
self.check_script_output(src, br"'\xc3\u20ac'")
241+
242+
def test_long_coding_name(self):
243+
src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
244+
b'print(ascii("\xc3\xa4"))\n')
245+
self.check_script_output(src, br"'\xc3\xa4'")
246+
247+
def test_long_first_utf8_line(self):
248+
src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
249+
self.check_script_output(src, b'')
250+
src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
251+
self.check_script_output(src, b'')
252+
253+
def test_long_second_utf8_line(self):
254+
src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
255+
self.check_script_output(src, b'')
256+
src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
257+
self.check_script_output(src, b'')
258+
212259
def test_first_non_utf8_coding_line(self):
213260
src = (b'#coding:iso-8859-15 \xa4\n'
214261
b'print(ascii("\xc3\xa4"))\n')
215262
self.check_script_output(src, br"'\xc3\u20ac'")
216263

217264
def test_second_non_utf8_coding_line(self):
218-
src = (b'\n'
265+
src = (b'#!/usr/bin/python\n'
219266
b'#coding:iso-8859-15 \xa4\n'
220267
b'print(ascii("\xc3\xa4"))\n')
221268
self.check_script_output(src, br"'\xc3\u20ac'")
@@ -224,27 +271,56 @@ def test_utf8_bom(self):
224271
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
225272
self.check_script_output(src, br"'\xe4'")
226273

274+
def test_utf8_bom_utf8_comments(self):
275+
src = (b'\xef\xbb\xbf#\xc3\xa4\n'
276+
b'#\xc3\xa4\n'
277+
b'print(ascii("\xc3\xa4"))\n')
278+
self.check_script_output(src, br"'\xe4'")
279+
227280
def test_utf8_bom_and_utf8_coding_line(self):
228281
src = (b'\xef\xbb\xbf#coding:utf-8\n'
229282
b'print(ascii("\xc3\xa4"))\n')
230283
self.check_script_output(src, br"'\xe4'")
231284

285+
def test_utf8_non_utf8_comment_line_error(self):
286+
src = (b'#coding: utf8\n'
287+
b'#\n'
288+
b'#\xa4\n'
289+
b'raise RuntimeError\n')
290+
self.check_script_error(src,
291+
br"'utf-8' codec can't decode byte|"
292+
br"encoding problem: utf8")
293+
232294
def test_crlf(self):
233295
src = (b'print(ascii("""\r\n"""))\n')
234-
out = self.check_script_output(src, br"'\n'")
296+
self.check_script_output(src, br"'\n'")
235297

236298
def test_crcrlf(self):
237299
src = (b'print(ascii("""\r\r\n"""))\n')
238-
out = self.check_script_output(src, br"'\n\n'")
300+
self.check_script_output(src, br"'\n\n'")
239301

240302
def test_crcrcrlf(self):
241303
src = (b'print(ascii("""\r\r\r\n"""))\n')
242-
out = self.check_script_output(src, br"'\n\n\n'")
304+
self.check_script_output(src, br"'\n\n\n'")
243305

244306
def test_crcrcrlf2(self):
245307
src = (b'#coding:iso-8859-1\n'
246308
b'print(ascii("""\r\r\r\n"""))\n')
247-
out = self.check_script_output(src, br"'\n\n\n'")
309+
self.check_script_output(src, br"'\n\n\n'")
310+
311+
def test_nul_in_first_coding_line(self):
312+
src = (b'#coding:iso8859-15\x00\n'
313+
b'\n'
314+
b'\n'
315+
b'raise RuntimeError\n')
316+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
317+
318+
def test_nul_in_second_coding_line(self):
319+
src = (b'#!/usr/bin/python\n'
320+
b'#coding:iso8859-15\x00\n'
321+
b'\n'
322+
b'raise RuntimeError\n')
323+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
248324

249325

250326
class UTF8ValidatorTest(unittest.TestCase):
@@ -324,6 +400,11 @@ def check_script_output(self, src, expected):
324400
out = stdout.getvalue().encode('latin1')
325401
self.assertEqual(out.rstrip(), expected)
326402

403+
def check_script_error(self, src, expected):
404+
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
405+
exec(src)
406+
# self.assertEqual(str(cm.exception).encode(), expected)
407+
327408

328409
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
329410

@@ -335,6 +416,14 @@ def check_script_output(self, src, expected):
335416
res = script_helper.assert_python_ok(fn)
336417
self.assertEqual(res.out.rstrip(), expected)
337418

419+
def check_script_error(self, src, expected):
420+
with tempfile.TemporaryDirectory() as tmpd:
421+
fn = os.path.join(tmpd, 'test.py')
422+
with open(fn, 'wb') as fp:
423+
fp.write(src)
424+
res = script_helper.assert_python_failure(fn)
425+
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError: ' + expected)
426+
338427

339428
if __name__ == "__main__":
340429
unittest.main()

Lib/test/test_tokenize.py

Lines changed: 82 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1346,24 +1346,63 @@ def readline():
13461346

13471347
def test_no_bom_no_encoding_cookie(self):
13481348
lines = (
1349-
b'# something\n',
1349+
b'#!/home/\xc3\xa4/bin/python\n',
1350+
b'# something \xe2\x82\xac\n',
13501351
b'print(something)\n',
13511352
b'do_something(else)\n'
13521353
)
13531354
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13541355
self.assertEqual(encoding, 'utf-8')
13551356
self.assertEqual(consumed_lines, list(lines[:2]))
13561357

1358+
def test_no_bom_no_encoding_cookie_first_line_error(self):
1359+
lines = (
1360+
b'#!/home/\xa4/bin/python\n\n',
1361+
b'print(something)\n',
1362+
b'do_something(else)\n'
1363+
)
1364+
with self.assertRaises(SyntaxError):
1365+
tokenize.detect_encoding(self.get_readline(lines))
1366+
1367+
def test_no_bom_no_encoding_cookie_second_line_error(self):
1368+
lines = (
1369+
b'#!/usr/bin/python\n',
1370+
b'# something \xe2\n',
1371+
b'print(something)\n',
1372+
b'do_something(else)\n'
1373+
)
1374+
with self.assertRaises(SyntaxError):
1375+
tokenize.detect_encoding(self.get_readline(lines))
1376+
13571377
def test_bom_no_cookie(self):
13581378
lines = (
1359-
b'\xef\xbb\xbf# something\n',
1379+
b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
13601380
b'print(something)\n',
13611381
b'do_something(else)\n'
13621382
)
13631383
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13641384
self.assertEqual(encoding, 'utf-8-sig')
13651385
self.assertEqual(consumed_lines,
1366-
[b'# something\n', b'print(something)\n'])
1386+
[b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])
1387+
1388+
def test_bom_no_cookie_first_line_error(self):
1389+
lines = (
1390+
b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
1391+
b'print(something)\n',
1392+
b'do_something(else)\n'
1393+
)
1394+
with self.assertRaises(SyntaxError):
1395+
tokenize.detect_encoding(self.get_readline(lines))
1396+
1397+
def test_bom_no_cookie_second_line_error(self):
1398+
lines = (
1399+
b'\xef\xbb\xbf#!/usr/bin/python\n',
1400+
b'# something \xe2\n',
1401+
b'print(something)\n',
1402+
b'do_something(else)\n'
1403+
)
1404+
with self.assertRaises(SyntaxError):
1405+
tokenize.detect_encoding(self.get_readline(lines))
13671406

13681407
def test_cookie_first_line_no_bom(self):
13691408
lines = (
@@ -1439,27 +1478,58 @@ def test_cookie_second_line_noncommented_first_line(self):
14391478
expected = [b"print('\xc2\xa3')\n"]
14401479
self.assertEqual(consumed_lines, expected)
14411480

1442-
def test_cookie_second_line_commented_first_line(self):
1481+
def test_cookie_second_line_empty_first_line(self):
14431482
lines = (
1444-
b"#print('\xc2\xa3')\n",
1483+
b'\n',
14451484
b'# vim: set fileencoding=iso8859-15 :\n',
14461485
b"print('\xe2\x82\xac')\n"
14471486
)
14481487
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14491488
self.assertEqual(encoding, 'iso8859-15')
1450-
expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1489+
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
14511490
self.assertEqual(consumed_lines, expected)
14521491

1453-
def test_cookie_second_line_empty_first_line(self):
1492+
def test_cookie_third_line(self):
14541493
lines = (
1455-
b'\n',
1456-
b'# vim: set fileencoding=iso8859-15 :\n',
1457-
b"print('\xe2\x82\xac')\n"
1494+
b'#!/home/\xc3\xa4/bin/python\n',
1495+
b'# something\n',
1496+
b'# vim: set fileencoding=ascii :\n',
1497+
b'print(something)\n',
1498+
b'do_something(else)\n'
1499+
)
1500+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1501+
self.assertEqual(encoding, 'utf-8')
1502+
self.assertEqual(consumed_lines, list(lines[:2]))
1503+
1504+
def test_double_coding_line(self):
1505+
# If the first line matches the second line is ignored.
1506+
lines = (
1507+
b'#coding:iso8859-15\n',
1508+
b'#coding:latin1\n',
1509+
b'print(something)\n'
14581510
)
14591511
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14601512
self.assertEqual(encoding, 'iso8859-15')
1461-
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1462-
self.assertEqual(consumed_lines, expected)
1513+
self.assertEqual(consumed_lines, list(lines[:1]))
1514+
1515+
def test_double_coding_same_line(self):
1516+
lines = (
1517+
b'#coding:iso8859-15 coding:latin1\n',
1518+
b'print(something)\n'
1519+
)
1520+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1521+
self.assertEqual(encoding, 'iso8859-15')
1522+
self.assertEqual(consumed_lines, list(lines[:1]))
1523+
1524+
def test_double_coding_utf8(self):
1525+
lines = (
1526+
b'#coding:utf-8\n',
1527+
b'#coding:latin1\n',
1528+
b'print(something)\n'
1529+
)
1530+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1531+
self.assertEqual(encoding, 'utf-8')
1532+
self.assertEqual(consumed_lines, list(lines[:1]))
14631533

14641534
def test_latin1_normalization(self):
14651535
# See get_normal_name() in Parser/tokenizer/helpers.c.
@@ -1485,7 +1555,6 @@ def test_syntaxerror_latin1(self):
14851555
readline = self.get_readline(lines)
14861556
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
14871557

1488-
14891558
def test_utf8_normalization(self):
14901559
# See get_normal_name() in Parser/tokenizer/helpers.c.
14911560
encodings = ("utf-8", "utf-8-mac", "utf-8-unix")

0 commit comments

Comments
 (0)