From e6d28ba121bdcfe3b4ff2392cc2bc1e98f6c2811 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 5 May 2023 11:15:49 +0200 Subject: [PATCH 1/3] [3.11] gh-96670: Raise SyntaxError when parsing NULL bytes (GH-97594). (cherry picked from commit aab01e3524d966dca6e72c718a2c71403a14e47c) Co-authored-by: Pablo Galindo Salgado --- Include/cpython/fileobject.h | 1 + Lib/test/test_ast.py | 4 ++++ Lib/test/test_builtin.py | 3 +-- Lib/test/test_cmd_line_script.py | 12 ++++++++++++ Lib/test/test_compile.py | 8 ++++---- Objects/fileobject.c | 29 +++++++++++++++++++---------- Parser/tokenizer.c | 19 +++++++++++++++---- Python/pythonrun.c | 2 +- 8 files changed, 57 insertions(+), 21 deletions(-) diff --git a/Include/cpython/fileobject.h b/Include/cpython/fileobject.h index cff2243d625e76..b70ec318986d82 100644 --- a/Include/cpython/fileobject.h +++ b/Include/cpython/fileobject.h @@ -3,6 +3,7 @@ #endif PyAPI_FUNC(char *) Py_UniversalNewlineFgets(char *, int, FILE*, PyObject *); +PyAPI_FUNC(char *) _Py_UniversalNewlineFgetsWithSize(char *, int, FILE*, PyObject *, size_t*); /* The std printer acts as a preliminary sys.stderr until the new io infrastructure is in place. */ diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py index b4ec1fef5ade05..7d9d0c431a66ff 100644 --- a/Lib/test/test_ast.py +++ b/Lib/test/test_ast.py @@ -857,6 +857,10 @@ def check_limit(prefix, repeated): check_limit("a", "[0]") check_limit("a", "*a") + def test_null_bytes(self): + with self.assertRaises(SyntaxError, + msg="source code string cannot contain null bytes"): + ast.parse("a\0b") class ASTHelpers_Test(unittest.TestCase): maxDiff = None diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 0f6d2db0ecb6d4..9078c409cc44b2 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -334,11 +334,10 @@ def test_compile(self): self.assertRaises(TypeError, compile) self.assertRaises(ValueError, compile, 'print(42)\n', '', 'badmode') self.assertRaises(ValueError, compile, 'print(42)\n', '', 'single', 0xff) - self.assertRaises(ValueError, compile, chr(0), 'f', 'exec') self.assertRaises(TypeError, compile, 'pass', '?', 'exec', mode='eval', source='0', filename='tmp') compile('print("\xe5")\n', '', 'exec') - self.assertRaises(ValueError, compile, chr(0), 'f', 'exec') + self.assertRaises(SyntaxError, compile, chr(0), 'f', 'exec') self.assertRaises(ValueError, compile, str('a = 1'), 'f', 'bad') # test the optimize argument diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index 4dadbc0b64bdb7..f10d72ea5547ee 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -657,6 +657,18 @@ def test_syntaxerror_invalid_escape_sequence_multi_line(self): ], ) + def test_syntaxerror_null_bytes(self): + script = "x = '\0' nothing to see here\n';import os;os.system('echo pwnd')\n" + with os_helper.temp_dir() as script_dir: + script_name = _make_test_script(script_dir, 'script', script) + exitcode, stdout, stderr = assert_python_failure(script_name) + self.assertEqual( + stderr.splitlines()[-2:], + [ b" x = '", + b'SyntaxError: source code cannot contain null bytes' + ], + ) + def test_consistent_sys_path_for_direct_execution(self): # This test case ensures that the following all give the same # sys.path configuration: diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py index 54e90663ab5100..c96ae4375df883 100644 --- a/Lib/test/test_compile.py +++ b/Lib/test/test_compile.py @@ -542,7 +542,7 @@ def test_particularly_evil_undecodable(self): with open(fn, "wb") as fp: fp.write(src) res = script_helper.run_python_until_end(fn)[0] - self.assertIn(b"Non-UTF-8", res.err) + self.assertIn(b"source code cannot contain null bytes", res.err) def test_yet_more_evil_still_undecodable(self): # Issue #25388 @@ -552,7 +552,7 @@ def test_yet_more_evil_still_undecodable(self): with open(fn, "wb") as fp: fp.write(src) res = script_helper.run_python_until_end(fn)[0] - self.assertIn(b"Non-UTF-8", res.err) + self.assertIn(b"source code cannot contain null bytes", res.err) @support.cpython_only def test_compiler_recursion_limit(self): @@ -588,9 +588,9 @@ def check_limit(prefix, repeated, mode="single"): def test_null_terminated(self): # The source code is null-terminated internally, but bytes-like # objects are accepted, which could be not terminated. - with self.assertRaisesRegex(ValueError, "cannot contain null"): + with self.assertRaisesRegex(SyntaxError, "cannot contain null"): compile("123\x00", "", "eval") - with self.assertRaisesRegex(ValueError, "cannot contain null"): + with self.assertRaisesRegex(SyntaxError, "cannot contain null"): compile(memoryview(b"123\x00"), "", "eval") code = compile(memoryview(b"123\x00")[1:-1], "", "eval") self.assertEqual(eval(code), 23) diff --git a/Objects/fileobject.c b/Objects/fileobject.c index 8dba5b9aea6db1..ffe55eb7c3730e 100644 --- a/Objects/fileobject.c +++ b/Objects/fileobject.c @@ -230,16 +230,8 @@ _PyLong_FileDescriptor_Converter(PyObject *o, void *ptr) return 1; } -/* -** Py_UniversalNewlineFgets is an fgets variation that understands -** all of \r, \n and \r\n conventions. -** The stream should be opened in binary mode. -** The fobj parameter exists solely for legacy reasons and must be NULL. -** Note that we need no error handling: fgets() treats error and eof -** identically. -*/ char * -Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj) +_Py_UniversalNewlineFgetsWithSize(char *buf, int n, FILE *stream, PyObject *fobj, size_t* size) { char *p = buf; int c; @@ -265,11 +257,28 @@ Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj) } FUNLOCKFILE(stream); *p = '\0'; - if (p == buf) + if (p == buf) { return NULL; + } + *size = p - buf; return buf; } +/* +** Py_UniversalNewlineFgets is an fgets variation that understands +** all of \r, \n and \r\n conventions. +** The stream should be opened in binary mode. +** The fobj parameter exists solely for legacy reasons and must be NULL. +** Note that we need no error handling: fgets() treats error and eof +** identically. +*/ + +char * +Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj) { + size_t size; + return _Py_UniversalNewlineFgetsWithSize(buf, n, stream, fobj, &size); +} + /* **************************** std printer **************************** * The stdprinter is used during the boot strapping phase as a preliminary * file like object for sys.stderr. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index ca11c7bebb4eb1..1ccff0e13b7068 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -376,6 +376,11 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) return 1; } +static inline int +contains_null_bytes(const char* str, size_t size) { + return memchr(str, 0, size) != NULL; +} + static int tok_readline_recode(struct tok_state *tok) { PyObject *line; @@ -831,9 +836,9 @@ tok_readline_raw(struct tok_state *tok) if (!tok_reserve_buf(tok, BUFSIZ)) { return 0; } - char *line = Py_UniversalNewlineFgets(tok->inp, - (int)(tok->end - tok->inp), - tok->fp, NULL); + int n_chars = (int)(tok->end - tok->inp); + size_t line_size = 0; + char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size); if (line == NULL) { return 1; } @@ -841,7 +846,7 @@ tok_readline_raw(struct tok_state *tok) tok_concatenate_interactive_new_line(tok, line) == -1) { return 0; } - tok->inp = strchr(tok->inp, '\0'); + tok->inp += line_size; if (tok->inp == tok->buf) { return 0; } @@ -1078,6 +1083,12 @@ tok_nextc(struct tok_state *tok) return EOF; } tok->line_start = tok->cur; + + if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { + syntaxerror(tok, "source code cannot contain null bytes"); + tok->cur = tok->inp; + return EOF; + } } Py_UNREACHABLE(); } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index f12b9f6e9539a2..efa22b07256c82 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1859,7 +1859,7 @@ _Py_SourceAsString(PyObject *cmd, const char *funcname, const char *what, PyComp } if (strlen(str) != (size_t)size) { - PyErr_SetString(PyExc_ValueError, + PyErr_SetString(PyExc_SyntaxError, "source code string cannot contain null bytes"); Py_CLEAR(*cmd_copy); return NULL; From c0b64eb73bdbdd1e46527e679dcb03c2953a610b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 5 May 2023 11:23:45 +0200 Subject: [PATCH 2/3] [3.11] Raise null bytes syntax error upon null in multiline string --- Lib/test/test_cmd_line_script.py | 13 +++++++++++++ Parser/tokenizer.c | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index f10d72ea5547ee..d10012759c370b 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -669,6 +669,19 @@ def test_syntaxerror_null_bytes(self): ], ) + def test_syntaxerror_null_bytes_in_multiline_string(self): + scripts = ["\n'''\nmultilinestring\0\n'''", "\nf'''\nmultilinestring\0\n'''"] # Both normal and f-strings + with os_helper.temp_dir() as script_dir: + for script in scripts: + script_name = _make_test_script(script_dir, 'script', script) + _, _, stderr = assert_python_failure(script_name) + self.assertEqual( + stderr.splitlines()[-2:], + [ b" multilinestring", + b'SyntaxError: source code cannot contain null bytes' + ] + ) + def test_consistent_sys_path_for_direct_execution(self): # This test case ensures that the following all give the same # sys.path configuration: diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 1ccff0e13b7068..b552b4171888f5 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1998,8 +1998,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Get rest of string */ while (end_quote_size != quote_size) { c = tok_nextc(tok); - if (tok->done == E_DECODE) + if (tok->done == E_ERROR) { + return ERRORTOKEN; + } + if (tok->done == E_DECODE) { break; + } if (c == EOF || (quote_size == 1 && c == '\n')) { assert(tok->multi_line_start != NULL); // shift the tok_state's location into From de5f0b34de7f6db43485630eb8d1c6c33f523c6a Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 5 May 2023 19:09:33 +0200 Subject: [PATCH 3/3] Add news blurb --- .../2022-09-27-11-59-13.gh-issue-96670.XrBBit.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-09-27-11-59-13.gh-issue-96670.XrBBit.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-27-11-59-13.gh-issue-96670.XrBBit.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-27-11-59-13.gh-issue-96670.XrBBit.rst new file mode 100644 index 00000000000000..eea90e76ee9945 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-27-11-59-13.gh-issue-96670.XrBBit.rst @@ -0,0 +1,2 @@ +The parser now raises :exc:`SyntaxError` when parsing source code containing +null bytes. Backported from ``aab01e3``. Patch by Pablo Galindo