diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ab53a20cff5539..8c91e271f4d5e8 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3188,6 +3188,16 @@ def get_tokens(string):
             with self.subTest(case=case):
                 self.assertRaises(tokenize.TokenError, get_tokens, case)
 
+    def test_tstring_multiline_bang_underflow(self):
+        # gh-149183: t-string with '!' across two lines used to raise
+        # MemoryError because last_expr_end > last_expr_size produced a
+        # negative length that was cast to a huge size_t.
+        self.assertRaises(
+            tokenize.TokenError,
+            list,
+            tokenize.tokenize(BytesIO(b't"{!\n!x').readline),
+        )
+
     @support.skip_wasi_stack_overflow()
     def test_max_indent(self):
         MAXINDENT = 100
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302c22..748df54338f6a3 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -121,12 +121,22 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     }
     PyObject *res = NULL;
 
+    Py_ssize_t expr_len = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    if (expr_len < 0) {
+        /* last_expr_end > last_expr_size: happens when '{' and the closing
+           delimiter span different source lines, causing the strlen-based
+           size tracking to underflow. Treat as a tokenizer error rather
+           than passing a negative length (cast to huge size_t) to malloc or
+           PyUnicode_DecodeUTF8.
+         */
+        return -1;
+    }
+
     // Look for a # character outside of string literals
     int hash_detected = 0;
     int in_string = 0;
     char quote_char = 0;
 
-    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
+    for (Py_ssize_t i = 0; i < expr_len; i++) {
         char ch = tok_mode->last_expr_buffer[i];
 
         // Skip escaped characters
@@ -163,7 +173,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     // If we found a # character in the expression, we need to handle comments
     if (hash_detected) {
         // Allocate buffer for processed result
-        char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
+        char *result = (char *)PyMem_Malloc((expr_len + 1) * sizeof(char));
         if (!result) {
             return -1;
         }
@@ -174,7 +184,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
         quote_char = 0;  // Current string quote char
 
         // Process each character
-        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+        while (i < expr_len) {
             char ch = tok_mode->last_expr_buffer[i];
 
             // Handle string quotes
@@ -190,11 +200,10 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
             }
             // Skip comments
            else if (ch == '#' && !in_string) {
-                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
-                       tok_mode->last_expr_buffer[i] != '\n') {
+                while (i < expr_len && tok_mode->last_expr_buffer[i] != '\n') {
                     i++;
                 }
-                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+                if (i < expr_len) {
                     result[j++] = '\n';
                 }
             }
@@ -211,7 +220,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     } else {
         res = PyUnicode_DecodeUTF8(
            tok_mode->last_expr_buffer,
-            tok_mode->last_expr_size - tok_mode->last_expr_end,
+            expr_len,
            NULL
        );
    }