gh-104972: Ensure that line attributes in tokens in the tokenize module are correct #104975

Merged
merged 1 commit on May 26, 2023
4 changes: 2 additions & 2 deletions Lib/idlelib/idle_test/test_editor.py
@@ -201,8 +201,8 @@ def test_searcher(self):
test_info = (# text, (block, indent))
("", (None, None)),
("[1,", (None, None)), # TokenError
("if 1:\n", ('if 1:', None)),
("if 1:\n 2\n 3\n", ('if 1:', ' 2')),
("if 1:\n", ('if 1:\n', None)),
("if 1:\n 2\n 3\n", ('if 1:\n', ' 2\n')),
)
for code, expected_pair in test_info:
with self.subTest(code=code):
15 changes: 13 additions & 2 deletions Lib/test/test_tokenize.py
@@ -1174,7 +1174,7 @@ def readline():

# skip the initial encoding token and the end tokens
tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
- expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+ expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")

@@ -1657,7 +1657,6 @@ def check_roundtrip(self, f):
code = f.encode('utf-8')
else:
code = f.read()
- f.close()
readline = iter(code.splitlines(keepends=True)).__next__
tokens5 = list(tokenize(readline))
tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ def check_roundtrip(self, f):
tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
self.assertEqual(tokens2_from5, tokens2)

+ def check_line_extraction(self, f):
+     if isinstance(f, str):
+         code = f.encode('utf-8')
+     else:
+         code = f.read()
+     readline = iter(code.splitlines(keepends=True)).__next__
+     for tok in tokenize(readline):
+         if tok.type in {ENCODING, ENDMARKER}:
+             continue
+         self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])

def test_roundtrip(self):
# There are some standard formatting practices that are easy to get right.

@@ -1768,6 +1778,7 @@ def test_random_files(self):
with open(testfile, 'rb') as f:
# with self.subTest(file=testfile):
self.check_roundtrip(f)
+ self.check_line_extraction(f)


def roundtrip(self, code):
Expand Down
@@ -0,0 +1,2 @@
+ Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+ the :mod:`tokenize` module is always correct. Patch by Pablo Galindo.
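
For context, a minimal sketch (not part of this PR) of the invariant that the new check_line_extraction helper verifies: after this fix, each token's line attribute holds the source line the token came from, so slicing it with the token's start and end columns reproduces the token's string. The sample source below is an arbitrary assumption chosen to avoid multi-line tokens; ENCODING and ENDMARKER are skipped, as in the test.

import io
from tokenize import tokenize, ENCODING, ENDMARKER

# Arbitrary single-line-token example; tokenize() wants a bytes readline.
source = b'if 1:\n    x = "text"\n'
for tok in tokenize(io.BytesIO(source).readline):
    if tok.type in {ENCODING, ENDMARKER}:
        continue
    # The token text must be recoverable from its own line attribute.
    assert tok.string == tok.line[tok.start[1]:tok.end[1]]
    print(tok)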
9 changes: 4 additions & 5 deletions Python/Python-tokenize.c
@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
goto exit;
}

- Py_ssize_t size = it->tok->inp - it->tok->buf;
- assert(it->tok->buf[size-1] == '\n');
- size -= 1; // Remove the newline character from the end of the line
- PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+ const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+ Py_ssize_t size = it->tok->inp - line_start;
+ PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
if (line == NULL) {
Py_DECREF(str);
goto exit;
}
- const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;

Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
Py_ssize_t end_lineno = it->tok->lineno;
Py_ssize_t col_offset = -1;