gh-105564: Don't include artificial newlines in the line attribute of tokens (#105565)
pablogsal committed Jun 9, 2023
1 parent 1dd267a commit d7f46bc
Showing 4 changed files with 30 additions and 25 deletions.
34 changes: 17 additions & 17 deletions Lib/test/test_peg_generator/test_pegen.py
@@ -552,34 +552,34 @@ def test_mutually_left_recursive(self) -> None:
string="D",
start=(1, 0),
end=(1, 1),
line="D A C A E\n",
line="D A C A E",
),
TokenInfo(
type=NAME,
string="A",
start=(1, 2),
end=(1, 3),
line="D A C A E\n",
line="D A C A E",
),
],
TokenInfo(
type=NAME,
string="C",
start=(1, 4),
end=(1, 5),
line="D A C A E\n",
line="D A C A E",
),
],
TokenInfo(
type=NAME,
string="A",
start=(1, 6),
end=(1, 7),
line="D A C A E\n",
line="D A C A E",
),
],
TokenInfo(
type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E\n"
type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E"
),
],
)
@@ -594,22 +594,22 @@ def test_mutually_left_recursive(self) -> None:
string="B",
start=(1, 0),
end=(1, 1),
line="B C A E\n",
line="B C A E",
),
TokenInfo(
type=NAME,
string="C",
start=(1, 2),
end=(1, 3),
line="B C A E\n",
line="B C A E",
),
],
TokenInfo(
type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E\n"
type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E"
),
],
TokenInfo(
type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E\n"
type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E"
),
],
)
@@ -655,18 +655,18 @@ def test_lookahead(self) -> None:
node,
[
TokenInfo(
NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 .\n"
NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 ."
),
TokenInfo(
OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 .\n"
OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 ."
),
[
TokenInfo(
NUMBER,
string="12",
start=(1, 6),
end=(1, 8),
line="foo = 12 + 12 .\n",
line="foo = 12 + 12 .",
),
[
[
@@ -675,14 +675,14 @@
string="+",
start=(1, 9),
end=(1, 10),
line="foo = 12 + 12 .\n",
line="foo = 12 + 12 .",
),
TokenInfo(
NUMBER,
string="12",
start=(1, 11),
end=(1, 13),
line="foo = 12 + 12 .\n",
line="foo = 12 + 12 .",
),
]
],
@@ -734,9 +734,9 @@ def test_cut(self) -> None:
self.assertEqual(
node,
[
TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)\n"),
TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)\n"),
TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)\n"),
TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)"),
TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)"),
TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)"),
],
)

16 changes: 8 additions & 8 deletions Lib/test/test_tokenize.py
@@ -1229,7 +1229,7 @@ def readline():
# skip the initial encoding token and the end tokens
tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
extra_tokens=True))[:-2]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")

@@ -1638,8 +1638,8 @@ def test_comment_at_the_end_of_the_source_without_newline(self):
TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test\n'),
TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test\n'),
TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
]

@@ -1653,7 +1653,7 @@ def test_newline_and_space_at_the_end_of_the_source_without_newline(self):
TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' \n'),
TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
]

@@ -1889,10 +1889,10 @@ def readline(encoding):
yield "1+1".encode(encoding)

expected = [
TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1\n'),
TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
]
for encoding in ["utf-8", "latin-1", "utf-16"]:
@@ -0,0 +1,2 @@
Don't include artificial newlines in the ``line`` attribute of tokens in the
APIs of the :mod:`tokenize` module. Patch by Pablo Galindo
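
For illustration, a minimal sketch of the behaviour this entry describes, matching the expectations updated in the tests above (the exact token stream may vary slightly between Python versions):

    import io
    import tokenize

    # Source that does not end with a newline: the tokenizer still appends an
    # implicit newline internally, but it no longer leaks into the tokens' `line`.
    src = "1+1"
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tok.type, repr(tok.string), repr(tok.line))
    # Tokens on line 1 now report line='1+1' rather than '1+1\n'.
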
3 changes: 3 additions & 0 deletions Python/Python-tokenize.c
@@ -206,6 +206,9 @@ tokenizeriter_next(tokenizeriterobject *it)
line = PyUnicode_FromString("");
} else {
Py_ssize_t size = it->tok->inp - line_start;
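        /* The tokenizer appends an implicit newline when the source does not
           end with one; exclude it here so the artificial newline never
           appears in the token's `line` attribute. */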
if (size >= 1 && it->tok->implicit_newline) {
size -= 1;
}
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
}
if (line == NULL) {
