From d7f46bcd989580340675bf0a9fdbfa1505a37e81 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado
Date: Fri, 9 Jun 2023 17:01:26 +0100
Subject: [PATCH] gh-105564: Don't include artificial newlines in the line
 attribute of tokens (#105565)

---
 Lib/test/test_peg_generator/test_pegen.py     | 34 +++++++++----------
 Lib/test/test_tokenize.py                     | 16 ++++-----
 ...-06-09-15-25-12.gh-issue-105564.sFdUu4.rst |  2 ++
 Python/Python-tokenize.c                      |  3 ++
 4 files changed, 30 insertions(+), 25 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst

diff --git a/Lib/test/test_peg_generator/test_pegen.py b/Lib/test/test_peg_generator/test_pegen.py
index 876bf789f48282..3af2c0cf47d20a 100644
--- a/Lib/test/test_peg_generator/test_pegen.py
+++ b/Lib/test/test_peg_generator/test_pegen.py
@@ -552,14 +552,14 @@ def test_mutually_left_recursive(self) -> None:
                     string="D",
                     start=(1, 0),
                     end=(1, 1),
-                    line="D A C A E\n",
+                    line="D A C A E",
                 ),
                 TokenInfo(
                     type=NAME,
                     string="A",
                     start=(1, 2),
                     end=(1, 3),
-                    line="D A C A E\n",
+                    line="D A C A E",
                 ),
             ],
             TokenInfo(
@@ -567,7 +567,7 @@ def test_mutually_left_recursive(self) -> None:
                     string="C",
                     start=(1, 4),
                     end=(1, 5),
-                    line="D A C A E\n",
+                    line="D A C A E",
                 ),
             ],
             TokenInfo(
@@ -575,11 +575,11 @@ def test_mutually_left_recursive(self) -> None:
                     string="A",
                     start=(1, 6),
                     end=(1, 7),
-                    line="D A C A E\n",
+                    line="D A C A E",
                 ),
             ],
             TokenInfo(
-                type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E\n"
+                type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E"
             ),
         ],
     )
@@ -594,22 +594,22 @@ def test_mutually_left_recursive(self) -> None:
                     string="B",
                     start=(1, 0),
                     end=(1, 1),
-                    line="B C A E\n",
+                    line="B C A E",
                 ),
                 TokenInfo(
                     type=NAME,
                     string="C",
                     start=(1, 2),
                     end=(1, 3),
-                    line="B C A E\n",
+                    line="B C A E",
                 ),
             ],
             TokenInfo(
-                type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E\n"
+                type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E"
             ),
         ],
         TokenInfo(
-            type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E\n"
+            type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E"
         ),
     ],
 )
@@ -655,10 +655,10 @@ def test_lookahead(self) -> None:
             node,
             [
                 TokenInfo(
-                    NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 .\n"
+                    NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 ."
                 ),
                 TokenInfo(
-                    OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 .\n"
+                    OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 ."
                 ),
                 [
                     TokenInfo(
@@ -666,7 +666,7 @@ def test_lookahead(self) -> None:
                         string="12",
                         start=(1, 6),
                         end=(1, 8),
-                        line="foo = 12 + 12 .\n",
+                        line="foo = 12 + 12 .",
                     ),
                     [
                         [
@@ -675,14 +675,14 @@ def test_lookahead(self) -> None:
                             string="+",
                             start=(1, 9),
                             end=(1, 10),
-                            line="foo = 12 + 12 .\n",
+                            line="foo = 12 + 12 .",
                         ),
                         TokenInfo(
                             NUMBER,
                             string="12",
                             start=(1, 11),
                             end=(1, 13),
-                            line="foo = 12 + 12 .\n",
+                            line="foo = 12 + 12 .",
                         ),
                     ]
                 ],
@@ -734,9 +734,9 @@ def test_cut(self) -> None:
         self.assertEqual(
             node,
             [
-                TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)\n"),
-                TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)\n"),
-                TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)\n"),
+                TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)"),
+                TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)"),
+                TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)"),
             ],
         )

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 6747b0d8f65a17..2c124f062e7fd6 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1229,7 +1229,7 @@ def readline():

         # skip the initial encoding token and the end tokens
         tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
                                                         extra_tokens=True))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")

@@ -1638,8 +1638,8 @@ def test_comment_at_the_end_of_the_source_without_newline(self):
             TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
             TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
             TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
-            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test\n'),
-            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test\n'),
+            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
+            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
             TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
         ]

@@ -1653,7 +1653,7 @@ def test_newline_and_space_at_the_end_of_the_source_without_newline(self):
             TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
             TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
             TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
-            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' \n'),
+            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
             TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
         ]

@@ -1889,10 +1889,10 @@ def readline(encoding):
             yield "1+1".encode(encoding)

         expected = [
-            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
-            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
-            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
-            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
+            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
             TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
         ]
         for encoding in ["utf-8", "latin-1", "utf-16"]:
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
new file mode 100644
index 00000000000000..9809fac49164f5
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
@@ -0,0 +1,2 @@
+Don't include artificial newlines in the ``line`` attribute of tokens in the
+APIs of the :mod:`tokenize` module. Patch by Pablo Galindo.
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 2cf052a0cdeb3b..1938562706914c 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -206,6 +206,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         line = PyUnicode_FromString("");
     }
     else {
         Py_ssize_t size = it->tok->inp - line_start;
+        if (size >= 1 && it->tok->implicit_newline) {
+            size -= 1;
+        }
         line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     }
     if (line == NULL) {
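
A minimal sketch of the user-visible effect, using only the public tokenize
API. It assumes a CPython build that includes this patch; the "1+1" source
string is illustrative and mirrors the expectations in test_tokenize.py above.

    import io
    import tokenize

    # The source deliberately lacks a trailing newline, so the tokenizer
    # has to synthesize an implicit NEWLINE token at the end of the line.
    source = "1+1"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tok)

    # Before this patch, the NUMBER/OP/NEWLINE tokens reported
    # line='1+1\n': the tokenizer's artificial newline leaked into the
    # `line` attribute. With the patch they report line='1+1', matching
    # the source text as actually written.
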