8 changes: 6 additions & 2 deletions Lib/test/test_exceptions.py
@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
if not isinstance(src, str):
src = src.decode(encoding, 'replace')
line = src.split('\n')[lineno-1]
if lineno == 1:
line = line.removeprefix('\ufeff')
self.assertIn(line, cm.exception.text)

def test_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
2, 19, encoding='cp1251')
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
check('x = "a', 1, 5)
check('lambda x: x = 2', 1, 1)
check('f{a + b + c}', 1, 2)
@@ -287,7 +291,7 @@ def baz():
check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
check("(1+)", 1, 4)
check("[interesting\nfoo()\n", 1, 1)
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
check("""f'''
{
(123_a)
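A standalone sketch of what the updated check() calls above assert; the expected values presume the post-change behavior, and the '<test>' filename is illustrative:

    # The UTF-8 BOM is consumed during decoding: it contributes nothing to the
    # reported offset and never appears in SyntaxError.text, which is why the
    # helper strips '\ufeff' before comparing.
    try:
        compile(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', '<test>', 'exec')
    except SyntaxError as exc:
        print(exc.lineno, exc.offset)             # expected: 1 12
        assert not exc.text.startswith('\ufeff')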
123 changes: 112 additions & 11 deletions Lib/test/test_source_encoding.py
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-

import unittest
from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
from test import support
from test.support import script_helper
from test.support.os_helper import TESTFN, unlink, rmtree
from test.support.import_helper import unload
import importlib
@@ -64,7 +65,7 @@ def test_issue7820(self):
# two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

@requires_subprocess()
@support.requires_subprocess()
def test_20731(self):
sub = subprocess.Popen([sys.executable,
os.path.join(os.path.dirname(__file__),
@@ -267,6 +268,17 @@ def test_second_non_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_first_utf8_coding_line_error(self):
src = (b'#coding:ascii \xc3\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")

def test_second_utf8_coding_line_error(self):
src = (b'#!/usr/bin/python\n'
b'#coding:ascii \xc3\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")

def test_utf8_bom(self):
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")
@@ -282,10 +294,80 @@ def test_utf8_bom_and_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_non_utf8_comment_line_error(self):
def test_utf8_bom_and_non_utf8_first_coding_line(self):
src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"encoding problem: iso-8859-15 with BOM",
lineno=1)

def test_utf8_bom_and_non_utf8_second_coding_line(self):
src = (b'\xef\xbb\xbf#first\n'
b'#coding:iso-8859-15\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"encoding problem: iso-8859-15 with BOM",
lineno=2)
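
A minimal sketch of the conflict these two tests pin down, assuming the error message asserted above:

    # A UTF-8 BOM fixes the encoding; a cookie naming a different codec on
    # line 1 or 2 must then fail, and the traceback points at the cookie's line.
    src = b'\xef\xbb\xbf#coding:iso-8859-15\nraise RuntimeError\n'
    try:
        compile(src, '<test>', 'exec')
    except SyntaxError as exc:
        print(exc.lineno, exc.msg)   # expected: 1 encoding problem: iso-8859-15 with BOM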

def test_non_utf8_shebang(self):
src = (b'#!/home/\xa4/bin/python\n'
b'#coding:iso-8859-15\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_utf8_shebang_error(self):
src = (b'#!/home/\xc3\xa4/bin/python\n'
b'#coding:ascii\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")

def test_non_utf8_shebang_error(self):
src = (b'#!/home/\xa4/bin/python\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
lineno=1)
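
A sketch of the rule the shebang tests encode; since the behavior is what this PR adds, treat the expected results as assumptions:

    # Raw non-UTF-8 bytes in a shebang or comment are tolerated only when a
    # coding cookie declares an encoding that can decode them.
    ok = b'#!/home/\xa4/bin/python\n#coding:iso-8859-15\npass\n'
    bad = b'#!/home/\xa4/bin/python\npass\n'
    compile(ok, '<test>', 'exec')       # iso-8859-15 decodes 0xa4, so this compiles
    try:
        compile(bad, '<test>', 'exec')  # no cookie: the UTF-8 default rejects 0xa4
    except SyntaxError as exc:
        print(exc.lineno)               # expected: 1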

def test_non_utf8_second_line_error(self):
src = (b'#first\n'
b'#second\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 2",
lineno=2)

def test_non_utf8_third_line_error(self):
src = (b'#first\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3",
lineno=3)

def test_utf8_bom_non_utf8_third_line_error(self):
src = (b'\xef\xbb\xbf#first\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3|"
br"'utf-8' codec can't decode byte",
lineno=3)

def test_utf_8_non_utf8_third_line_error(self):
src = (b'#coding: utf-8\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3|"
br"'utf-8' codec can't decode byte",
lineno=3)

def test_utf8_non_utf8_third_line_error(self):
src = (b'#coding: utf8\n'
b'#\n'
b'#\xa4\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"'utf-8' codec can't decode byte|"
@@ -326,7 +408,7 @@ def test_nul_in_second_coding_line(self):
class UTF8ValidatorTest(unittest.TestCase):
@unittest.skipIf(not sys.platform.startswith("linux"),
"Too slow to run on non-Linux platforms")
@requires_resource('cpu')
@support.requires_resource('cpu')
def test_invalid_utf8(self):
# This is a port of test_utf8_decode_invalid_sequences in
# test_unicode.py to exercise the separate utf8 validator in
@@ -392,19 +474,29 @@ def check(content):
check(b'\xF4'+cb+b'\xBF\xBF')
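
For context, a few invalid sequences of the kind the validator must reject, checked here against Python's own UTF-8 codec (a sketch, not part of the suite):

    for bad in (b'\x80', b'\xc0\xaf', b'\xf4\x90\xbf\xbf'):
        try:
            bad.decode('utf-8')
        except UnicodeDecodeError as exc:
            # stray continuation byte, overlong form, code point past U+10FFFF
            print(bad, '->', exc.reason)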


@support.force_not_colorized_test_class
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

def check_script_output(self, src, expected):
with captured_stdout() as stdout:
with support.captured_stdout() as stdout:
exec(src)
out = stdout.getvalue().encode('latin1')
self.assertEqual(out.rstrip(), expected)

def check_script_error(self, src, expected):
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
def check_script_error(self, src, expected, lineno=...):
with self.assertRaises(SyntaxError) as cm:
exec(src)
exc = cm.exception
self.assertRegex(str(exc), expected.decode())
if lineno is not ...:
self.assertEqual(exc.lineno, lineno)
line = src.splitlines()[lineno-1].decode(errors='replace')
if lineno == 1:
line = line.removeprefix('\ufeff')
self.assertEqual(line, exc.text)
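
A standalone illustration of what check_script_error() verifies in the in-memory case; expected values follow the tests above and assume the post-change behavior:

    try:
        exec(b'#first\n#second\xa4\nraise RuntimeError\n')
    except SyntaxError as exc:
        assert exc.lineno == 2
        # The undecodable byte is rendered via errors='replace', so the
        # reported source line ends with U+FFFD.
        assert exc.text == '#second\ufffd'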


@support.force_not_colorized_test_class
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

def check_script_output(self, src, expected):
@@ -415,13 +507,22 @@ def check_script_output(self, src, expected):
res = script_helper.assert_python_ok(fn)
self.assertEqual(res.out.rstrip(), expected)

def check_script_error(self, src, expected):
def check_script_error(self, src, expected, lineno=...):
with tempfile.TemporaryDirectory() as tmpd:
fn = os.path.join(tmpd, 'test.py')
with open(fn, 'wb') as fp:
fp.write(src)
res = script_helper.assert_python_failure(fn)
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
err = res.err.rstrip()
self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected)
if lineno is not ...:
self.assertIn(f', line {lineno}\n'.encode(),
err.replace(os.linesep.encode(), b'\n'))
line = src.splitlines()[lineno-1].decode(errors='replace')
if lineno == 1:
line = line.removeprefix('\ufeff')
self.assertIn(line.encode(), err)



if __name__ == "__main__":
@@ -0,0 +1,5 @@
Support a non-UTF-8 shebang and comments in Python source files if a non-UTF-8
encoding is specified. Detect decoding errors in comments for the default
(UTF-8) encoding. Show the line and position of a decoding error for the
default encoding in a traceback. Show the line containing the coding cookie
when it conflicts with the BOM in a traceback.
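
A quick sketch of the third point (line and position of a decoding error under the default encoding); treat the exact values printed as assumptions:

    try:
        exec(b'x = 1\ny = "\xa4"\n')
    except SyntaxError as exc:
        # expected: lineno 2, with offset pointing at the undecodable byte
        print(exc.lineno, exc.offset, ascii(exc.text))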
8 changes: 8 additions & 0 deletions Parser/pegen_errors.c
@@ -2,6 +2,7 @@
#include <errcode.h>

#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
#include "pycore_runtime.h" // _Py_ID()
#include "lexer/state.h"
#include "lexer/lexer.h"
#include "pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
PyObject *value;
PyObject *tback;
PyErr_Fetch(&type, &value, &tback);
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
goto error;
}
PyErr_Restore(type, value, tback);
return;
}
errstr = PyObject_Str(value);
if (!errstr) {
goto error;
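The observable effect of this hunk, sketched from the outside (the path and the exact stderr format are assumptions): a SyntaxError raised while initializing the tokenizer keeps its message and gains the script's filename instead of being re-wrapped as a generic error.

    import os, subprocess, sys, tempfile

    with tempfile.TemporaryDirectory() as tmpd:
        fn = os.path.join(tmpd, 'test.py')
        with open(fn, 'wb') as fp:
            fp.write(b'\xef\xbb\xbf#coding:iso-8859-15\n')
        res = subprocess.run([sys.executable, fn], capture_output=True)
        # stderr should name test.py and line 1 in the SyntaxError report
        print(res.stderr.decode(errors='replace'))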
49 changes: 35 additions & 14 deletions Parser/tokenizer/file_tokenizer.c
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
}

static int
tok_underflow_file(struct tok_state *tok) {
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
tok_underflow_file(struct tok_state *tok)
{
if (tok->decoding_state == STATE_INIT) {
/* We have not yet determined the encoding.
If an encoding is found, use the file-pointer
@@ -296,8 +294,16 @@ }
}
assert(tok->decoding_state != STATE_INIT);
}
int raw = tok->decoding_readline == NULL;
if (raw && tok->decoding_state != STATE_NORMAL) {
/* Keep the first line in the buffer to validate it later if
* the encoding has not yet been determined. */
}
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
/* Read until '\n' or EOF */
if (tok->decoding_readline != NULL) {
if (!raw) {
/* We already have a codec associated with this input. */
if (!tok_readline_recode(tok)) {
return 0;
@@ -328,20 +334,35 @@

ADVANCE_LINENO();
if (tok->decoding_state != STATE_NORMAL) {
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
if (tok->lineno >= 2) {
tok->decoding_state = STATE_NORMAL;
}
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
_PyTokenizer_error_ret(tok);
return 0;
if (raw && tok->decoding_state == STATE_NORMAL) {
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
Comment on lines +347 to +358 (Contributor). Suggested change: name the lineno <= 2 condition and guard the strlen() result against Py_ssize_t overflow before passing it to PyUnicode_Decode():

const int is_pseudo_line = (tok->lineno <= 2);
const char *line = is_pseudo_line ? tok->buf : tok->cur;
int lineno = is_pseudo_line ? 1 : tok->lineno;
size_t slen = strlen(line);
if (slen > (size_t)PY_SSIZE_T_MAX) {
    _PyTokenizer_error_ret(tok);
    return 0;
}
Py_ssize_t linelen = (Py_ssize_t)slen;
if (!tok->encoding) {
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
        _PyTokenizer_error_ret(tok);
        return 0;
    }
}
else {
    PyObject *tmp = PyUnicode_Decode(line, linelen,
tok->encoding, NULL);
if (tmp == NULL) {
_PyTokenizer_error_ret(tok);
return 0;
}
Py_DECREF(tmp);
}
}
assert(tok->done == E_OK);
return tok->done == E_OK;
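
Why the new code distinguishes tok->lineno <= 2: per PEP 263 a coding cookie is honored only on the first two lines, so only those lines may still need re-validation once the encoding is settled. A sketch of that boundary (the expected line number is an assumption based on the tests above):

    # A cookie on line 3 is ignored; the default UTF-8 path then reports the
    # first undecodable byte with its real line number.
    src = b'#a\n#b\n#coding:iso-8859-15\nx = "\xa4"\n'
    try:
        compile(src, '<test>', 'exec')
    except SyntaxError as exc:
        print(exc.lineno)   # expected: 4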