diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst
index 4c0a0ceef7dc4e..111289c767f35c 100644
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -57,6 +57,16 @@ The primary entry point is a :term:`generator`:
    :func:`.tokenize` determines the source encoding of the file by looking for a
    UTF-8 BOM or encoding cookie, according to :pep:`263`.
 
+.. function:: generate_tokens(readline)
+
+   Tokenize a source reading unicode strings instead of bytes.
+
+   Like :func:`.tokenize`, the *readline* argument is a callable returning
+   a single line of input. However, :func:`generate_tokens` expects *readline*
+   to return a str object rather than bytes.
+
+   The result is an iterator yielding named tuples, exactly like
+   :func:`.tokenize`. It does not yield an :data:`~token.ENCODING` token.
 
 All constants from the :mod:`token` module are also exported from
 :mod:`tokenize`.
@@ -79,7 +89,8 @@ write back the modified script.
    positions) may change.
 
    It returns bytes, encoded using the :data:`~token.ENCODING` token, which
-   is the first token sequence output by :func:`.tokenize`.
+   is the first token sequence output by :func:`.tokenize`. If there is no
+   encoding token in the input, it returns a str instead.
 
 
 :func:`.tokenize` needs to detect the encoding of source files it tokenizes. The
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 3520a67bd42b11..93e40de96e9eb2 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,8 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
-from io import BytesIO
+                     open as tokenize_open, Untokenizer, generate_tokens)
+from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
@@ -919,6 +919,19 @@ async def bar(): pass
         DEDENT     ''            (7, 0) (7, 0)
     """)
 
+class GenerateTokensTest(TokenizeTest):
+    def check_tokenize(self, s, expected):
+        # Format the tokens in s in a table format.
+        # The ENDMARKER is omitted.
+        result = []
+        f = StringIO(s)
+        for type, token, start, end, line in generate_tokens(f.readline):
+            if type == ENDMARKER:
+                break
+            type = tok_name[type]
+            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        self.assertEqual(result, expected.rstrip().splitlines())
+
 
 def decistmt(s):
     result = []
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 40e6a8b9297b24..c78d9f7e9ee5af 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -37,7 +37,7 @@ blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["tokenize", "detect_encoding",
+__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                            "untokenize", "TokenInfo"]
 del token
 
@@ -653,9 +653,12 @@ def _tokenize(readline, encoding):
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 
 
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
 def generate_tokens(readline):
+    """Tokenize a source reading Python code as unicode strings.
+
+    This has the same API as tokenize(), except that it expects the *readline*
+    callable to return str objects instead of bytes.
+    """
     return _tokenize(readline, None)
 
 def main():
diff --git a/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
new file mode 100644
index 00000000000000..89c88e27373b5b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
@@ -0,0 +1,2 @@
+:func:`tokenize.generate_tokens` is now documented as a public API to
+tokenize unicode strings. It was previously present but undocumented.