From 7e9910eba54e293bce5c17c0c0d483bbee52452f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 4 Nov 2021 21:58:22 +0200 Subject: [PATCH 1/5] gh-63161: Fix PEP 263 support * Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified. * Detect decoding error in comments for UTF-8 encoding. --- Lib/test/test_source_encoding.py | 63 ++++++++++++++++++- ...5-10-01-18-21-19.gh-issue-63161.ef1S6N.rst | 3 + Parser/tokenizer/file_tokenizer.c | 49 ++++++++++----- Parser/tokenizer/helpers.c | 13 ++-- Parser/tokenizer/helpers.h | 2 +- Parser/tokenizer/readline_tokenizer.c | 2 +- Parser/tokenizer/string_tokenizer.c | 3 + 7 files changed, 113 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 5df40782382120..268f490608eb18 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -267,6 +267,17 @@ def test_second_non_utf8_coding_line(self): b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xc3\u20ac'") + def test_first_utf8_coding_line_error(self): + src = (b'#coding:ascii \xc3\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"'ascii' codec can't decode byte") + + def test_second_utf8_coding_line_error(self): + src = (b'#!/usr/bin/python\n' + b'#coding:ascii \xc3\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"'ascii' codec can't decode byte") + def test_utf8_bom(self): src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xe4'") @@ -282,7 +293,57 @@ def test_utf8_bom_and_utf8_coding_line(self): b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xe4'") - def test_utf8_non_utf8_comment_line_error(self): + def test_non_utf8_shebang(self): + src = (b'#!/home/\xa4/bin/python\n' + b'#coding:iso-8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\u20ac'") + + def test_utf8_shebang_error(self): + src = (b'#!/home/\xc3\xa4/bin/python\n' + b'#coding:ascii\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"'ascii' codec can't decode byte") + + def test_non_utf8_shebang_error(self): + src = (b'#!/home/\xa4/bin/python\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1") + + def test_non_utf8_second_line_error(self): + src = (b'#\n' + b'#\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"Non-UTF-8 code starting with .* on line 2") + + def test_non_utf8_third_line_error(self): + src = (b'#\n' + b'#\n' + b'#\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"Non-UTF-8 code starting with .* on line 3") + + def test_utf8_bom_non_utf8_third_line_error(self): + src = (b'\xef\xbb\xbf#\n' + b'#\n' + b'#\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"Non-UTF-8 code starting with .* on line 3|" + br"'utf-8' codec can't decode byte") + + def test_utf_8_non_utf8_third_line_error(self): + src = (b'#coding: utf-8\n' + b'#\n' + b'#\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"Non-UTF-8 code starting with .* on line 3|" + br"'utf-8' codec can't decode byte") + + def test_utf8_non_utf8_third_line_error(self): src = (b'#coding: utf8\n' b'#\n' b'#\xa4\n' diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst new file mode 100644 index 00000000000000..2449b3488e5c89 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst @@ -0,0 +1,3 @@ +Support non-UTF-8 shebang and comments in Python source files if non-UTF-8 +encoding is specified. Detect decoding error in comments for default (UTF-8) +encoding. diff --git a/Parser/tokenizer/file_tokenizer.c b/Parser/tokenizer/file_tokenizer.c index 01e473f58a0777..8c836a3f725829 100644 --- a/Parser/tokenizer/file_tokenizer.c +++ b/Parser/tokenizer/file_tokenizer.c @@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) { } static int -tok_underflow_file(struct tok_state *tok) { - if (tok->start == NULL && !INSIDE_FSTRING(tok)) { - tok->cur = tok->inp = tok->buf; - } +tok_underflow_file(struct tok_state *tok) +{ if (tok->decoding_state == STATE_INIT) { /* We have not yet determined the encoding. If an encoding is found, use the file-pointer @@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) { } assert(tok->decoding_state != STATE_INIT); } + int raw = tok->decoding_readline == NULL; + if (raw && tok->decoding_state != STATE_NORMAL) { + /* Keep the first line in the buffer to validate it later if + * the encoding has not yet been determined. */ + } + else if (tok->start == NULL && !INSIDE_FSTRING(tok)) { + tok->cur = tok->inp = tok->buf; + } /* Read until '\n' or EOF */ - if (tok->decoding_readline != NULL) { + if (!raw) { /* We already have a codec associated with this input. */ if (!tok_readline_recode(tok)) { return 0; @@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) { ADVANCE_LINENO(); if (tok->decoding_state != STATE_NORMAL) { - if (tok->lineno > 2) { - tok->decoding_state = STATE_NORMAL; - } - else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur), + if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur), tok, fp_setreadl)) { return 0; } + if (tok->lineno >= 2) { + tok->decoding_state = STATE_NORMAL; + } } - /* The default encoding is UTF-8, so make sure we don't have any - non-UTF-8 sequences in it. */ - if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) { - _PyTokenizer_error_ret(tok); - return 0; + if (raw && tok->decoding_state == STATE_NORMAL) { + const char *line = tok->lineno <= 2 ? tok->buf : tok->cur; + int lineno = tok->lineno <= 2 ? 1 : tok->lineno; + if (!tok->encoding) { + /* The default encoding is UTF-8, so make sure we don't have any + non-UTF-8 sequences in it. */ + if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) { + _PyTokenizer_error_ret(tok); + return 0; + } + } + else { + PyObject *tmp = PyUnicode_Decode(line, strlen(line), + tok->encoding, NULL); + if (tmp == NULL) { + _PyTokenizer_error_ret(tok); + return 0; + } + Py_DECREF(tmp); + } } assert(tok->done == E_OK); return tok->done == E_OK; diff --git a/Parser/tokenizer/helpers.c b/Parser/tokenizer/helpers.c index 5a416adb875aa1..0e89c30deac9ce 100644 --- a/Parser/tokenizer/helpers.c +++ b/Parser/tokenizer/helpers.c @@ -496,24 +496,27 @@ valid_utf8(const unsigned char* s) } int -_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok) +_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno) { int badchar = 0; - unsigned char *c; + const unsigned char *c; int length; - for (c = (unsigned char *)line; *c; c += length) { + for (c = (const unsigned char *)line; *c; c += length) { if (!(length = valid_utf8(c))) { badchar = *c; break; } + if (*c == '\n') { + lineno++; + } } if (badchar) { PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " - "in file %U on line %i, " + "in file %V on line %i, " "but no encoding declared; " "see https://peps.python.org/pep-0263/ for details", - badchar, tok->filename, tok->lineno); + badchar, tok->filename, "", lineno); return 0; } return 1; diff --git a/Parser/tokenizer/helpers.h b/Parser/tokenizer/helpers.h index 42ea13cd1f853f..98f6445d5a3b40 100644 --- a/Parser/tokenizer/helpers.h +++ b/Parser/tokenizer/helpers.h @@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *), struct tok_state *tok); int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)); -int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok); +int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno); #ifdef Py_DEBUG void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size); diff --git a/Parser/tokenizer/readline_tokenizer.c b/Parser/tokenizer/readline_tokenizer.c index 22f84c77a12b47..0f7769aeb8fd57 100644 --- a/Parser/tokenizer/readline_tokenizer.c +++ b/Parser/tokenizer/readline_tokenizer.c @@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) { ADVANCE_LINENO(); /* The default encoding is UTF-8, so make sure we don't have any non-UTF-8 sequences in it. */ - if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) { + if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) { _PyTokenizer_error_ret(tok); return 0; } diff --git a/Parser/tokenizer/string_tokenizer.c b/Parser/tokenizer/string_tokenizer.c index 0c26d5df8d4a40..5e9d91e45e031c 100644 --- a/Parser/tokenizer/string_tokenizer.c +++ b/Parser/tokenizer/string_tokenizer.c @@ -102,6 +102,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr return _PyTokenizer_error_ret(tok); str = PyBytes_AS_STRING(utf8); } + else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) { + return _PyTokenizer_error_ret(tok); + } assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; From 3ab168a13c2f6dcb1b1ff7ffbd71cb15c834adcf Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 3 Oct 2025 17:23:00 +0300 Subject: [PATCH 2/5] Include the decoding error position for default encoding in SyntaxError. --- Lib/test/test_exceptions.py | 8 +- Lib/test/test_source_encoding.py | 78 ++++++++++++++----- ...5-10-01-18-21-19.gh-issue-63161.ef1S6N.rst | 4 +- Parser/pegen_errors.c | 8 ++ Parser/tokenizer/helpers.c | 47 +++++++---- Parser/tokenizer/string_tokenizer.c | 3 + 6 files changed, 108 insertions(+), 40 deletions(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 59f77f91d85e5c..323a8c401bde6c 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding= if not isinstance(src, str): src = src.decode(encoding, 'replace') line = src.split('\n')[lineno-1] + if lineno == 1: + line = line.removeprefix('\ufeff') self.assertIn(line, cm.exception.text) def test_error_offset_continuation_characters(self): @@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self): check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20) check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +', 2, 19, encoding='cp1251') - check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10) + check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12) + check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12) + check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12) check('x = "a', 1, 5) check('lambda x: x = 2', 1, 1) check('f{a + b + c}', 1, 2) @@ -287,7 +291,7 @@ def baz(): check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4) check("(1+)", 1, 4) check("[interesting\nfoo()\n", 1, 1) - check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1) + check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0) check("""f''' { (123_a) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 268f490608eb18..f91845d2d50f8e 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -293,6 +293,21 @@ def test_utf8_bom_and_utf8_coding_line(self): b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xe4'") + def test_utf8_bom_and_non_utf8_first_coding_line(self): + src = (b'\xef\xbb\xbf#coding:iso-8859-15\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"encoding problem: iso-8859-15 with BOM", + lineno=1) + + def test_utf8_bom_and_non_utf8_second_coding_line(self): + src = (b'\xef\xbb\xbf#first\n' + b'#coding:iso-8859-15\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"encoding problem: iso-8859-15 with BOM", + lineno=2) + def test_non_utf8_shebang(self): src = (b'#!/home/\xa4/bin/python\n' b'#coding:iso-8859-15\n' @@ -308,45 +323,50 @@ def test_utf8_shebang_error(self): def test_non_utf8_shebang_error(self): src = (b'#!/home/\xa4/bin/python\n' b'raise RuntimeError\n') - self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1") + self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1", + lineno=1) def test_non_utf8_second_line_error(self): - src = (b'#\n' - b'#\xa4\n' + src = (b'#first\n' + b'#second\xa4\n' b'raise RuntimeError\n') self.check_script_error(src, - br"Non-UTF-8 code starting with .* on line 2") + br"Non-UTF-8 code starting with .* on line 2", + lineno=2) def test_non_utf8_third_line_error(self): - src = (b'#\n' - b'#\n' - b'#\xa4\n' + src = (b'#first\n' + b'#second\n' + b'#third\xa4\n' b'raise RuntimeError\n') self.check_script_error(src, - br"Non-UTF-8 code starting with .* on line 3") + br"Non-UTF-8 code starting with .* on line 3", + lineno=3) def test_utf8_bom_non_utf8_third_line_error(self): - src = (b'\xef\xbb\xbf#\n' - b'#\n' - b'#\xa4\n' + src = (b'\xef\xbb\xbf#first\n' + b'#second\n' + b'#third\xa4\n' b'raise RuntimeError\n') self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 3|" - br"'utf-8' codec can't decode byte") + br"'utf-8' codec can't decode byte", + lineno=3) def test_utf_8_non_utf8_third_line_error(self): src = (b'#coding: utf-8\n' - b'#\n' - b'#\xa4\n' + b'#second\n' + b'#third\xa4\n' b'raise RuntimeError\n') self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 3|" - br"'utf-8' codec can't decode byte") + br"'utf-8' codec can't decode byte", + lineno=3) def test_utf8_non_utf8_third_line_error(self): src = (b'#coding: utf8\n' - b'#\n' - b'#\xa4\n' + b'#second\n' + b'#third\xa4\n' b'raise RuntimeError\n') self.check_script_error(src, br"'utf-8' codec can't decode byte|" @@ -461,9 +481,17 @@ def check_script_output(self, src, expected): out = stdout.getvalue().encode('latin1') self.assertEqual(out.rstrip(), expected) - def check_script_error(self, src, expected): - with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm: + def check_script_error(self, src, expected, lineno=...): + with self.assertRaises(SyntaxError) as cm: exec(src) + exc = cm.exception + self.assertRegex(str(exc), expected.decode()) + if lineno is not ...: + self.assertEqual(exc.lineno, lineno) + line = src.splitlines()[lineno-1].decode(errors='replace') + if lineno == 1: + line = line.removeprefix('\ufeff') + self.assertEqual(line, exc.text) class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): @@ -476,13 +504,21 @@ def check_script_output(self, src, expected): res = script_helper.assert_python_ok(fn) self.assertEqual(res.out.rstrip(), expected) - def check_script_error(self, src, expected): + def check_script_error(self, src, expected, lineno=...): with tempfile.TemporaryDirectory() as tmpd: fn = os.path.join(tmpd, 'test.py') with open(fn, 'wb') as fp: fp.write(src) res = script_helper.assert_python_failure(fn) - self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected) + err = res.err.rstrip() + self.assertRegex(err.splitlines()[-1], b'SyntaxError.*?' + expected) + if lineno is not ...: + self.assertIn(f', line {lineno}\n'.encode(), err) + line = src.splitlines()[lineno-1].decode(errors='replace') + if lineno == 1: + line = line.removeprefix('\ufeff') + self.assertIn(line.encode(), err) + if __name__ == "__main__": diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst index 2449b3488e5c89..5eafe0813dc933 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst @@ -1,3 +1,5 @@ Support non-UTF-8 shebang and comments in Python source files if non-UTF-8 encoding is specified. Detect decoding error in comments for default (UTF-8) -encoding. +encoding. Show the line and position of decoding error for default encoding +in a traceback. Show the line containing the coding cookie when it conflicts +with the BOM in a traceback. diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index f62b8695995617..0639a4e42436be 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -2,6 +2,7 @@ #include #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() +#include "pycore_runtime.h" // _Py_ID() #include "lexer/state.h" #include "lexer/lexer.h" #include "pegen.h" @@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename) PyObject *value; PyObject *tback; PyErr_Fetch(&type, &value, &tback); + if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) { + if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) { + goto error; + } + PyErr_Restore(type, value, tback); + return; + } errstr = PyObject_Str(value); if (!errstr) { goto error; diff --git a/Parser/tokenizer/helpers.c b/Parser/tokenizer/helpers.c index 0e89c30deac9ce..eb55b9dc617aaf 100644 --- a/Parser/tokenizer/helpers.c +++ b/Parser/tokenizer/helpers.c @@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format, goto error; } - args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, - col_offset, errtext, tok->lineno, end_col_offset); + args = Py_BuildValue("(O(OiiNii))", errmsg, + tok->filename ? tok->filename : Py_None, + tok->lineno, col_offset, errtext, + tok->lineno, end_col_offset); if (args) { PyErr_SetObject(PyExc_SyntaxError, args); Py_DECREF(args); @@ -422,10 +424,12 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta tok->encoding = cs; } else { /* then, compare cs with BOM */ if (strcmp(tok->encoding, cs) != 0) { - _PyTokenizer_error_ret(tok); - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s with BOM", cs); + tok->line_start = line; + tok->cur = (char *)line; + _PyTokenizer_syntaxerror_known_range(tok, 0, size, + "encoding problem: %s with BOM", cs); PyMem_Free(cs); + _PyTokenizer_error_ret(tok); return 0; } PyMem_Free(cs); @@ -498,25 +502,36 @@ valid_utf8(const unsigned char* s) int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno) { - int badchar = 0; - const unsigned char *c; + const char *badchar = NULL; + const char *c; int length; - for (c = (const unsigned char *)line; *c; c += length) { - if (!(length = valid_utf8(c))) { - badchar = *c; + int col_offset = 0; + const char *line_start = line; + for (c = line; *c; c += length) { + if (!(length = valid_utf8((const unsigned char *)c))) { + badchar = c; break; } + col_offset++; if (*c == '\n') { lineno++; + col_offset = 0; + line_start = c + 1; } } if (badchar) { - PyErr_Format(PyExc_SyntaxError, - "Non-UTF-8 code starting with '\\x%.2x' " - "in file %V on line %i, " - "but no encoding declared; " - "see https://peps.python.org/pep-0263/ for details", - badchar, tok->filename, "", lineno); + tok->lineno = lineno; + tok->line_start = line_start; + tok->cur = (char *)badchar; + _PyTokenizer_syntaxerror_known_range(tok, + col_offset + 1, col_offset + 1, + "Non-UTF-8 code starting with '\\x%.2x'" + "%s%V on line %i, " + "but no encoding declared; " + "see https://peps.python.org/pep-0263/ for details", + (unsigned char)*badchar, + tok->filename ? " in file " : "", tok->filename, "", + lineno); return 0; } return 1; diff --git a/Parser/tokenizer/string_tokenizer.c b/Parser/tokenizer/string_tokenizer.c index 5e9d91e45e031c..7299ecf483ccd9 100644 --- a/Parser/tokenizer/string_tokenizer.c +++ b/Parser/tokenizer/string_tokenizer.c @@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr /* need to check line 1 and 2 separately since check_coding_spec assumes a single line as input */ if (newl[0]) { + tok->lineno = 1; if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { return NULL; } if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { + tok->lineno = 2; if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return NULL; } } + tok->lineno = 0; if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); From 9fd1bb298f1699004381dbf3f06ea4abd1fee398 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 3 Oct 2025 17:46:34 +0300 Subject: [PATCH 3/5] Try to disable colorization. --- Lib/test/test_source_encoding.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index f91845d2d50f8e..8cec46e925e5d3 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- import unittest -from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource +from test import support +from test.support import script_helper from test.support.os_helper import TESTFN, unlink, rmtree from test.support.import_helper import unload import importlib @@ -64,7 +65,7 @@ def test_issue7820(self): # two bytes in common with the UTF-8 BOM self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') - @requires_subprocess() + @support.requires_subprocess() def test_20731(self): sub = subprocess.Popen([sys.executable, os.path.join(os.path.dirname(__file__), @@ -270,13 +271,13 @@ def test_second_non_utf8_coding_line(self): def test_first_utf8_coding_line_error(self): src = (b'#coding:ascii \xc3\xa4\n' b'raise RuntimeError\n') - self.check_script_error(src, br"'ascii' codec can't decode byte") + self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte") def test_second_utf8_coding_line_error(self): src = (b'#!/usr/bin/python\n' b'#coding:ascii \xc3\xa4\n' b'raise RuntimeError\n') - self.check_script_error(src, br"'ascii' codec can't decode byte") + self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte") def test_utf8_bom(self): src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') @@ -318,7 +319,7 @@ def test_utf8_shebang_error(self): src = (b'#!/home/\xc3\xa4/bin/python\n' b'#coding:ascii\n' b'raise RuntimeError\n') - self.check_script_error(src, br"'ascii' codec can't decode byte") + self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte") def test_non_utf8_shebang_error(self): src = (b'#!/home/\xa4/bin/python\n' @@ -407,7 +408,7 @@ def test_nul_in_second_coding_line(self): class UTF8ValidatorTest(unittest.TestCase): @unittest.skipIf(not sys.platform.startswith("linux"), "Too slow to run on non-Linux platforms") - @requires_resource('cpu') + @support.requires_resource('cpu') def test_invalid_utf8(self): # This is a port of test_utf8_decode_invalid_sequences in # test_unicode.py to exercise the separate utf8 validator in @@ -473,10 +474,11 @@ def check(content): check(b'\xF4'+cb+b'\xBF\xBF') +@support.force_not_colorized_test_class class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): def check_script_output(self, src, expected): - with captured_stdout() as stdout: + with support.captured_stdout() as stdout: exec(src) out = stdout.getvalue().encode('latin1') self.assertEqual(out.rstrip(), expected) @@ -494,6 +496,7 @@ def check_script_error(self, src, expected, lineno=...): self.assertEqual(line, exc.text) +@support.force_not_colorized_test_class class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): def check_script_output(self, src, expected): @@ -511,7 +514,7 @@ def check_script_error(self, src, expected, lineno=...): fp.write(src) res = script_helper.assert_python_failure(fn) err = res.err.rstrip() - self.assertRegex(err.splitlines()[-1], b'SyntaxError.*?' + expected) + self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected) if lineno is not ...: self.assertIn(f', line {lineno}\n'.encode(), err) line = src.splitlines()[lineno-1].decode(errors='replace') From 62993b3fe6bfc541b0cbac791e50a1eda7035cb4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 3 Oct 2025 18:28:01 +0300 Subject: [PATCH 4/5] Fix tests on Windows. --- Lib/test/test_source_encoding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 8cec46e925e5d3..46b291192df429 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -516,7 +516,8 @@ def check_script_error(self, src, expected, lineno=...): err = res.err.rstrip() self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected) if lineno is not ...: - self.assertIn(f', line {lineno}\n'.encode(), err) + self.assertIn(f', line {lineno}\n'.encode(), + err.replace(os.linesep.encode(), b'\n')) line = src.splitlines()[lineno-1].decode(errors='replace') if lineno == 1: line = line.removeprefix('\ufeff') From f932ebd9df0b5f4c7ea94d00437f9f54c9d23f59 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 10 Oct 2025 14:46:35 +0300 Subject: [PATCH 5/5] Silence some compiler warnings. --- Parser/tokenizer/helpers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Parser/tokenizer/helpers.c b/Parser/tokenizer/helpers.c index eb55b9dc617aaf..e5e2eed2d34aee 100644 --- a/Parser/tokenizer/helpers.c +++ b/Parser/tokenizer/helpers.c @@ -426,7 +426,8 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta if (strcmp(tok->encoding, cs) != 0) { tok->line_start = line; tok->cur = (char *)line; - _PyTokenizer_syntaxerror_known_range(tok, 0, size, + assert(size <= INT_MAX); + _PyTokenizer_syntaxerror_known_range(tok, 0, (int)size, "encoding problem: %s with BOM", cs); PyMem_Free(cs); _PyTokenizer_error_ret(tok);