bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891)

Most of the change involves fixing up the test suite, which previously
assumed that no NEWLINE token would be emitted if the input didn't end
in a newline.

Contributed by Ammar Askar.
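
As an illustration of the new behavior (a sketch against an interpreter with this change applied; not part of the commit):

    import io
    import tokenize

    # "1 + 1" has no trailing newline, yet the token stream now ends
    # with an implicit NEWLINE followed by ENDMARKER.
    toks = list(tokenize.tokenize(io.BytesIO(b"1 + 1").readline))
    print([tokenize.tok_name[t.type] for t in toks])
    # ['ENCODING', 'NUMBER', 'OP', 'NUMBER', 'NEWLINE', 'ENDMARKER']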
ammaraskar authored and taleinat committed Jul 6, 2018
1 parent 3c8aae9 commit c4ef489
Showing 3 changed files with 60 additions and 24 deletions.
71 changes: 47 additions & 24 deletions Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -11,27 +12,51 @@
 import token
 
+
+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
 
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def bar(): pass
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = StringIO(s)
-        for type, token, start, end, line in generate_tokens(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
         self.assertEqual(result, expected.rstrip().splitlines())


@@ -1022,8 +1042,8 @@ def readline():
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1039,8 +1059,8 @@ def readline():
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1351,18 +1371,21 @@ def test_oneline_defs(self):

         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
@@ -1515,7 +1538,7 @@ def test_roundtrip(self):
self.check_roundtrip("if x == 1:\n"
" print(x)\n")
self.check_roundtrip("# This is a comment\n"
"# This also")
"# This also\n")

# Some people use different formatting conventions, which makes
# untokenize a little trickier. Note that this test involves trailing
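
For reference, a standalone sketch of what the new stringify_tokens_from_source helper produces when the source lacks a trailing newline (assumes the helper and imports defined above are in scope; not part of the commit):

    from io import BytesIO
    from tokenize import tokenize

    source = "1 + 1"  # no trailing newline
    readline = BytesIO(source.encode('utf-8')).readline
    for row in stringify_tokens_from_source(tokenize(readline), source):
        print(row)
    # The implicit NEWLINE on the final line is filtered out:
    #     ENCODING   'utf-8'       (0, 0) (0, 0)
    #     NUMBER     '1'           (1, 0) (1, 1)
    #     OP         '+'           (1, 2) (1, 3)
    #     NUMBER     '1'           (1, 4) (1, 5)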
10 changes: 10 additions & 0 deletions Lib/tokenize.py
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
     yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
     while True:                                # loop over lines in stream
         try:
+            # We capture the value of the line variable here because
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
                                 (lnum, pos), (lnum, pos+1), line)
                 pos += 1
 
+    # Add an implicit NEWLINE if the input doesn't end in one
+    if last_line and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
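
To see the token this new branch synthesizes (a sketch, not part of the commit): the implicit NEWLINE carries an empty string and an empty line attribute, positioned just past the end of the last physical line.

    import io
    import tokenize

    toks = list(tokenize.tokenize(io.BytesIO(b"x").readline))
    print(toks[-2])
    # TokenInfo(type=4 (NEWLINE), string='', start=(1, 1), end=(1, 2), line='')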
@@ -0,0 +1,3 @@
+Tokenize module now implicitly emits a NEWLINE when provided with input that
+does not have a trailing new line. This behavior now matches what the C
+tokenizer does internally. Contributed by Ammar Askar.
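
A minimal check of the parity described above (an illustration, not part of the commit): the sequence of token types no longer depends on whether the source ends with a newline.

    import io
    import tokenize

    def token_types(source_bytes):
        readline = io.BytesIO(source_bytes).readline
        return [tok.type for tok in tokenize.tokenize(readline)]

    assert token_types(b"x = 1\n") == token_types(b"x = 1")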
