diff --git a/.gitattributes b/.gitattributes
index 4a487c3c2a14e5..274c2da3ab85b5 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -54,3 +54,6 @@ Include/opcode.h linguist-generated=true
Python/opcode_targets.h linguist-generated=true
Objects/typeslots.inc linguist-generated=true
Modules/unicodedata_db.h linguist-generated=true
+Doc/library/token-list.inc linguist-generated=true
+Include/token.h linguist-generated=true
+Parser/token.c linguist-generated=true
diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
new file mode 100644
index 00000000000000..cd6e0f26968eea
--- /dev/null
+++ b/Doc/library/token-list.inc
@@ -0,0 +1,206 @@
+.. Auto-generated by Tools/scripts/generate_token.py
+.. data:: ENDMARKER
+
+.. data:: NAME
+
+.. data:: NUMBER
+
+.. data:: STRING
+
+.. data:: NEWLINE
+
+.. data:: INDENT
+
+.. data:: DEDENT
+
+.. data:: LPAR
+
+ Token value for ``"("``.
+
+.. data:: RPAR
+
+ Token value for ``")"``.
+
+.. data:: LSQB
+
+ Token value for ``"["``.
+
+.. data:: RSQB
+
+ Token value for ``"]"``.
+
+.. data:: COLON
+
+ Token value for ``":"``.
+
+.. data:: COMMA
+
+ Token value for ``","``.
+
+.. data:: SEMI
+
+ Token value for ``";"``.
+
+.. data:: PLUS
+
+ Token value for ``"+"``.
+
+.. data:: MINUS
+
+ Token value for ``"-"``.
+
+.. data:: STAR
+
+ Token value for ``"*"``.
+
+.. data:: SLASH
+
+ Token value for ``"/"``.
+
+.. data:: VBAR
+
+ Token value for ``"|"``.
+
+.. data:: AMPER
+
+ Token value for ``"&"``.
+
+.. data:: LESS
+
+ Token value for ``"<"``.
+
+.. data:: GREATER
+
+ Token value for ``">"``.
+
+.. data:: EQUAL
+
+ Token value for ``"="``.
+
+.. data:: DOT
+
+ Token value for ``"."``.
+
+.. data:: PERCENT
+
+ Token value for ``"%"``.
+
+.. data:: LBRACE
+
+ Token value for ``"{"``.
+
+.. data:: RBRACE
+
+ Token value for ``"}"``.
+
+.. data:: EQEQUAL
+
+ Token value for ``"=="``.
+
+.. data:: NOTEQUAL
+
+ Token value for ``"!="``.
+
+.. data:: LESSEQUAL
+
+ Token value for ``"<="``.
+
+.. data:: GREATEREQUAL
+
+ Token value for ``">="``.
+
+.. data:: TILDE
+
+ Token value for ``"~"``.
+
+.. data:: CIRCUMFLEX
+
+ Token value for ``"^"``.
+
+.. data:: LEFTSHIFT
+
+ Token value for ``"<<"``.
+
+.. data:: RIGHTSHIFT
+
+ Token value for ``">>"``.
+
+.. data:: DOUBLESTAR
+
+ Token value for ``"**"``.
+
+.. data:: PLUSEQUAL
+
+ Token value for ``"+="``.
+
+.. data:: MINEQUAL
+
+ Token value for ``"-="``.
+
+.. data:: STAREQUAL
+
+ Token value for ``"*="``.
+
+.. data:: SLASHEQUAL
+
+ Token value for ``"/="``.
+
+.. data:: PERCENTEQUAL
+
+ Token value for ``"%="``.
+
+.. data:: AMPEREQUAL
+
+ Token value for ``"&="``.
+
+.. data:: VBAREQUAL
+
+ Token value for ``"|="``.
+
+.. data:: CIRCUMFLEXEQUAL
+
+ Token value for ``"^="``.
+
+.. data:: LEFTSHIFTEQUAL
+
+ Token value for ``"<<="``.
+
+.. data:: RIGHTSHIFTEQUAL
+
+ Token value for ``">>="``.
+
+.. data:: DOUBLESTAREQUAL
+
+ Token value for ``"**="``.
+
+.. data:: DOUBLESLASH
+
+ Token value for ``"//"``.
+
+.. data:: DOUBLESLASHEQUAL
+
+ Token value for ``"//="``.
+
+.. data:: AT
+
+ Token value for ``"@"``.
+
+.. data:: ATEQUAL
+
+ Token value for ``"@="``.
+
+.. data:: RARROW
+
+ Token value for ``"->"``.
+
+.. data:: ELLIPSIS
+
+ Token value for ``"..."``.
+
+.. data:: OP
+
+.. data:: ERRORTOKEN
+
+.. data:: N_TOKENS
+
+.. data:: NT_OFFSET
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index 373991027e4ca9..5358eb5a291e63 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
-.. data:: ENDMARKER
- NAME
- NUMBER
- STRING
- NEWLINE
- INDENT
- DEDENT
- LPAR
- RPAR
- LSQB
- RSQB
- COLON
- COMMA
- SEMI
- PLUS
- MINUS
- STAR
- SLASH
- VBAR
- AMPER
- LESS
- GREATER
- EQUAL
- DOT
- PERCENT
- LBRACE
- RBRACE
- EQEQUAL
- NOTEQUAL
- LESSEQUAL
- GREATEREQUAL
- TILDE
- CIRCUMFLEX
- LEFTSHIFT
- RIGHTSHIFT
- DOUBLESTAR
- PLUSEQUAL
- MINEQUAL
- STAREQUAL
- SLASHEQUAL
- PERCENTEQUAL
- AMPEREQUAL
- VBAREQUAL
- CIRCUMFLEXEQUAL
- LEFTSHIFTEQUAL
- RIGHTSHIFTEQUAL
- DOUBLESTAREQUAL
- DOUBLESLASH
- DOUBLESLASHEQUAL
- AT
- ATEQUAL
- RARROW
- ELLIPSIS
- OP
- ERRORTOKEN
- N_TOKENS
- NT_OFFSET
-
+.. include:: token-list.inc
The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.
diff --git a/Include/token.h b/Include/token.h
index cd1cd00f09c460..2d491e6927d1a9 100644
--- a/Include/token.h
+++ b/Include/token.h
@@ -1,3 +1,4 @@
+/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
@@ -62,25 +63,19 @@ extern "C" {
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
-/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP 53
#define ERRORTOKEN 54
-/* These aren't used by the C tokenizer but are needed for tokenize.py */
-#define COMMENT 55
-#define NL 56
-#define ENCODING 57
#define N_TOKENS 58
+#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
-#define NT_OFFSET 256
-
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
-PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
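
The ISTERMINAL/ISNONTERMINAL/ISEOF macros kept above mirror the helper functions of the same names in Lib/token.py: values below NT_OFFSET (256) are terminal tokens, grammar symbols start at NT_OFFSET. A quick sketch of the Python-side equivalents, assuming an interpreter built with this patch so that symbol.single_input is the first non-terminal at 256:

    import token, symbol

    assert token.ISTERMINAL(token.NAME)              # 1 < 256
    assert token.ISNONTERMINAL(symbol.single_input)  # 256 >= 256
    assert token.ISEOF(token.ENDMARKER)              # ENDMARKER == 0
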
diff --git a/Lib/symbol.py b/Lib/symbol.py
old mode 100755
new mode 100644
index dc7dcba5e4d0b5..664a2a4932da5c
--- a/Lib/symbol.py
+++ b/Lib/symbol.py
@@ -1,13 +1,6 @@
-#! /usr/bin/env python3
-
"""Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
-#
-# To update the symbols in this file, 'cd' to the top directory of
-# the python source tree after building the interpreter and run:
-#
-# ./python Lib/symbol.py
#--start constants--
single_input = 256
@@ -103,14 +96,4 @@
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
-
-
-def _main():
- import sys
- import token
- if len(sys.argv) == 1:
- sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
- token._main()
-
-if __name__ == "__main__":
- _main()
+del _name, _value
diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py
index c1306f54327f4e..ed86aec36b873c 100644
--- a/Lib/test/test_symbol.py
+++ b/Lib/test/test_symbol.py
@@ -6,6 +6,9 @@
SYMBOL_FILE = support.findfile('symbol.py')
+GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
+ '..', '..', 'Tools', 'scripts',
+ 'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
@@ -22,7 +25,7 @@ def _copy_file_without_generated_symbols(self, source_file, dest_file):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
- SYMBOL_FILE,
+ GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ff1447954943d9..04a12542c6ae25 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1619,6 +1619,8 @@ def test_random_files(self):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
+ if support.verbose >= 2:
+ print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
diff --git a/Lib/token.py b/Lib/token.py
index ba132059abf5ee..63ebd36df7c28a 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -1,84 +1,87 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
-# This file is automatically generated; please don't muck it up!
-#
-# To update the symbols in this file, 'cd' to the top directory of
-# the python source tree after building the interpreter and run:
-#
-# ./python Lib/token.py
+tokens = [
+ 'ENDMARKER',
+ 'NAME',
+ 'NUMBER',
+ 'STRING',
+ 'NEWLINE',
+ 'INDENT',
+ 'DEDENT',
+
+ ('LPAR', '('),
+ ('RPAR', ')'),
+ ('LSQB', '['),
+ ('RSQB', ']'),
+ ('COLON', ':'),
+ ('COMMA', ','),
+ ('SEMI', ';'),
+ ('PLUS', '+'),
+ ('MINUS', '-'),
+ ('STAR', '*'),
+ ('SLASH', '/'),
+ ('VBAR', '|'),
+ ('AMPER', '&'),
+ ('LESS', '<'),
+ ('GREATER', '>'),
+ ('EQUAL', '='),
+ ('DOT', '.'),
+ ('PERCENT', '%'),
+ ('LBRACE', '{'),
+ ('RBRACE', '}'),
+ ('EQEQUAL', '=='),
+ ('NOTEQUAL', '!='),
+ ('LESSEQUAL', '<='),
+ ('GREATEREQUAL', '>='),
+ ('TILDE', '~'),
+ ('CIRCUMFLEX', '^'),
+ ('LEFTSHIFT', '<<'),
+ ('RIGHTSHIFT', '>>'),
+ ('DOUBLESTAR', '**'),
+ ('PLUSEQUAL', '+='),
+ ('MINEQUAL', '-='),
+ ('STAREQUAL', '*='),
+ ('SLASHEQUAL', '/='),
+ ('PERCENTEQUAL', '%='),
+ ('AMPEREQUAL', '&='),
+ ('VBAREQUAL', '|='),
+ ('CIRCUMFLEXEQUAL', '^='),
+ ('LEFTSHIFTEQUAL', '<<='),
+ ('RIGHTSHIFTEQUAL', '>>='),
+ ('DOUBLESTAREQUAL', '**='),
+ ('DOUBLESLASH', '//'),
+ ('DOUBLESLASHEQUAL', '//='),
+ ('AT', '@'),
+ ('ATEQUAL', '@='),
+ ('RARROW', '->'),
+ ('ELLIPSIS', '...'),
+
+ 'OP',
+ 'ERRORTOKEN',
+
+ # These aren't used by the C tokenizer but are needed for tokenize.py
+ 'COMMENT',
+ 'NL',
+ 'ENCODING',
+
+ 'N_TOKENS',
+]
+
+tok_name = {i: v[0] if isinstance(v, tuple) else v
+ for i, v in enumerate(tokens)}
+EXACT_TOKEN_TYPES = {x: i
+ for i, v in enumerate(tokens)
+ if isinstance(v, tuple)
+ for x in v[1:]}
+del tokens
-#--start constants--
-ENDMARKER = 0
-NAME = 1
-NUMBER = 2
-STRING = 3
-NEWLINE = 4
-INDENT = 5
-DEDENT = 6
-LPAR = 7
-RPAR = 8
-LSQB = 9
-RSQB = 10
-COLON = 11
-COMMA = 12
-SEMI = 13
-PLUS = 14
-MINUS = 15
-STAR = 16
-SLASH = 17
-VBAR = 18
-AMPER = 19
-LESS = 20
-GREATER = 21
-EQUAL = 22
-DOT = 23
-PERCENT = 24
-LBRACE = 25
-RBRACE = 26
-EQEQUAL = 27
-NOTEQUAL = 28
-LESSEQUAL = 29
-GREATEREQUAL = 30
-TILDE = 31
-CIRCUMFLEX = 32
-LEFTSHIFT = 33
-RIGHTSHIFT = 34
-DOUBLESTAR = 35
-PLUSEQUAL = 36
-MINEQUAL = 37
-STAREQUAL = 38
-SLASHEQUAL = 39
-PERCENTEQUAL = 40
-AMPEREQUAL = 41
-VBAREQUAL = 42
-CIRCUMFLEXEQUAL = 43
-LEFTSHIFTEQUAL = 44
-RIGHTSHIFTEQUAL = 45
-DOUBLESTAREQUAL = 46
-DOUBLESLASH = 47
-DOUBLESLASHEQUAL = 48
-AT = 49
-ATEQUAL = 50
-RARROW = 51
-ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
-OP = 53
-ERRORTOKEN = 54
-# These aren't used by the C tokenizer but are needed for tokenize.py
-COMMENT = 55
-NL = 56
-ENCODING = 57
-N_TOKENS = 58
# Special definitions for cooperation with parser
-NT_OFFSET = 256
-#--end constants--
+tok_name[256] = 'NT_OFFSET'
-tok_name = {value: name
- for name, value in globals().items()
- if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
+globals().update({name: value for value, name in tok_name.items()})
def ISTERMINAL(x):
return x < NT_OFFSET
@@ -88,73 +91,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
-
-
-def _main():
- import re
- import sys
- args = sys.argv[1:]
- inFileName = args and args[0] or "Include/token.h"
- outFileName = "Lib/token.py"
- if len(args) > 1:
- outFileName = args[1]
- try:
- fp = open(inFileName)
- except OSError as err:
- sys.stdout.write("I/O error: %s\n" % str(err))
- sys.exit(1)
- with fp:
- lines = fp.read().split("\n")
- prog = re.compile(
- r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
- re.IGNORECASE)
- comment_regex = re.compile(
- r"^\s*/\*\s*(.+?)\s*\*/\s*$",
- re.IGNORECASE)
-
- tokens = {}
- prev_val = None
- for line in lines:
- match = prog.match(line)
- if match:
- name, val = match.group(1, 2)
- val = int(val)
- tokens[val] = {'token': name} # reverse so we can sort them...
- prev_val = val
- else:
- comment_match = comment_regex.match(line)
- if comment_match and prev_val is not None:
- comment = comment_match.group(1)
- tokens[prev_val]['comment'] = comment
- keys = sorted(tokens.keys())
- # load the output skeleton from the target:
- try:
- fp = open(outFileName)
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(2)
- with fp:
- format = fp.read().split("\n")
- try:
- start = format.index("#--start constants--") + 1
- end = format.index("#--end constants--")
- except ValueError:
- sys.stderr.write("target does not contain format markers")
- sys.exit(3)
- lines = []
- for key in keys:
- lines.append("%s = %d" % (tokens[key]["token"], key))
- if "comment" in tokens[key]:
- lines.append("# %s" % tokens[key]["comment"])
- format[start:end] = lines
- try:
- fp = open(outFileName, 'w')
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(4)
- with fp:
- fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
- _main()
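
With this rewrite everything in Lib/token.py is derived from the single tokens list: bare strings only get a name and a value, while (name, string) pairs additionally feed EXACT_TOKEN_TYPES, and the integer constants are injected back into globals() from tok_name. A minimal sketch of what the module then exposes; the concrete values assume this patched Lib/token.py is the one imported:

    import token

    assert token.tok_name[0] == 'ENDMARKER'
    assert token.tok_name[53] == 'OP'
    assert token.tok_name[256] == 'NT_OFFSET'

    # The usual integer constants still exist, injected via globals().update().
    assert token.OP == 53 and token.NT_OFFSET == 256

    # Operator strings map to their exact token types.
    assert token.EXACT_TOKEN_TYPES['('] == token.LPAR
    assert token.EXACT_TOKEN_TYPES['**='] == token.DOUBLESTAREQUAL
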
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010bc5e7aa7..cf1ecc99a94438 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@
import re
import sys
from token import *
+from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@
"untokenize", "TokenInfo"]
del token
-EXACT_TOKEN_TYPES = {
- '(': LPAR,
- ')': RPAR,
- '[': LSQB,
- ']': RSQB,
- ':': COLON,
- ',': COMMA,
- ';': SEMI,
- '+': PLUS,
- '-': MINUS,
- '*': STAR,
- '/': SLASH,
- '|': VBAR,
- '&': AMPER,
- '<': LESS,
- '>': GREATER,
- '=': EQUAL,
- '.': DOT,
- '%': PERCENT,
- '{': LBRACE,
- '}': RBRACE,
- '==': EQEQUAL,
- '!=': NOTEQUAL,
- '<=': LESSEQUAL,
- '>=': GREATEREQUAL,
- '~': TILDE,
- '^': CIRCUMFLEX,
- '<<': LEFTSHIFT,
- '>>': RIGHTSHIFT,
- '**': DOUBLESTAR,
- '+=': PLUSEQUAL,
- '-=': MINEQUAL,
- '*=': STAREQUAL,
- '/=': SLASHEQUAL,
- '%=': PERCENTEQUAL,
- '&=': AMPEREQUAL,
- '|=': VBAREQUAL,
- '^=': CIRCUMFLEXEQUAL,
- '<<=': LEFTSHIFTEQUAL,
- '>>=': RIGHTSHIFTEQUAL,
- '**=': DOUBLESTAREQUAL,
- '//': DOUBLESLASH,
- '//=': DOUBLESLASHEQUAL,
- '...': ELLIPSIS,
- '->': RARROW,
- '@': AT,
- '@=': ATEQUAL,
-}
-
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@ def _compile(expr):
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
- r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
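
The new Special pattern relies on Python's regex alternation being leftmost-first rather than longest-match; sorting the operator strings of EXACT_TOKEN_TYPES in reverse order guarantees that every operator appears before any of its own prefixes. A standalone sketch of why the ordering matters, using a plain '|'.join instead of the module's group() helper:

    import re

    ops = ['*', '**', '**=']

    # '*' listed first: '**=' is only matched one character at a time.
    naive = re.compile('|'.join(map(re.escape, ops)))
    assert naive.match('**=').group() == '*'

    # Reverse sort puts '**=' before '**' before '*'.
    fixed = re.compile('|'.join(map(re.escape, sorted(ops, reverse=True))))
    assert fixed.match('**=').group() == '**='
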
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 87a84eb68083ae..834e4122feef09 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -295,6 +295,7 @@ POBJS= \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
+ Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
@@ -552,7 +553,7 @@ coverage-lcov:
@echo
# Force regeneration of parser and importlib
-coverage-report: regen-grammar regen-importlib
+coverage-report: regen-grammar regen-token regen-importlib
@ # build with coverage info
$(MAKE) coverage
@ # run tests, ignore failures
@@ -734,7 +735,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
- regen-ast regen-importlib clinic
+ regen-token regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
@@ -842,6 +843,32 @@ regen-opcode:
$(srcdir)/Include/opcode.h.new
$(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new
+.PHONY: regen-token
+regen-token:
+ # Regenerate Doc/library/token-list.inc from Lib/token.py
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
+ $(srcdir)/Lib/token.py \
+ $(srcdir)/Doc/library/token-list.inc
+ # Regenerate Include/token.h from Lib/token.py
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
+ $(srcdir)/Lib/token.py \
+ $(srcdir)/Include/token.h
+ # Regenerate Parser/token.c from Lib/token.py
+ # using Tools/scripts/generate_token.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
+ $(srcdir)/Lib/token.py \
+ $(srcdir)/Parser/token.c
+
+.PHONY: regen-symbol
+regen-symbol: $(srcdir)/Include/graminit.h
+ # Regenerate Lib/symbol.py from Include/graminit.h
+ # using Tools/scripts/generate_symbol_py.py
+ $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \
+ $(srcdir)/Include/graminit.h \
+ $(srcdir)/Lib/symbol.py
+
Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h
Python/getplatform.o: $(srcdir)/Python/getplatform.c
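
Each step of regen-token above is an ordinary invocation of the new generator script with a format selector plus input and output paths, so the same regeneration can be driven without make. A hedged sketch, run from a source checkout with the paths used in the Makefile rules:

    import subprocess, sys

    for fmt, outfile in [('rst', 'Doc/library/token-list.inc'),
                         ('h', 'Include/token.h'),
                         ('c', 'Parser/token.c')]:
        subprocess.run([sys.executable, 'Tools/scripts/generate_token.py',
                        fmt, 'Lib/token.py', outfile], check=True)
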
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst
new file mode 100644
index 00000000000000..2694fc22afb4de
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst
@@ -0,0 +1,2 @@
+Include/token.h, Parser/token.c and Doc/library/token-list.inc are now
+generated from Lib/token.py by Tools/scripts/generate_token.py.
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 0ae24fade13db6..52cf405b5c9c2e 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -355,6 +355,7 @@
+    <ClCompile Include="..\Parser\token.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index ef5ef7268a39ac..153bd809b8f463 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -830,6 +830,9 @@
       <Filter>Parser</Filter>
+    <ClCompile Include="..\Parser\token.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
       <Filter>PC</Filter>
diff --git a/Parser/token.c b/Parser/token.c
new file mode 100644
index 00000000000000..35519aa4b61161
--- /dev/null
+++ b/Parser/token.c
@@ -0,0 +1,233 @@
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+#include "Python.h"
+#include "token.h"
+
+/* Token names */
+
+const char * const _PyParser_TokenNames[] = {
+ "ENDMARKER",
+ "NAME",
+ "NUMBER",
+ "STRING",
+ "NEWLINE",
+ "INDENT",
+ "DEDENT",
+ "LPAR",
+ "RPAR",
+ "LSQB",
+ "RSQB",
+ "COLON",
+ "COMMA",
+ "SEMI",
+ "PLUS",
+ "MINUS",
+ "STAR",
+ "SLASH",
+ "VBAR",
+ "AMPER",
+ "LESS",
+ "GREATER",
+ "EQUAL",
+ "DOT",
+ "PERCENT",
+ "LBRACE",
+ "RBRACE",
+ "EQEQUAL",
+ "NOTEQUAL",
+ "LESSEQUAL",
+ "GREATEREQUAL",
+ "TILDE",
+ "CIRCUMFLEX",
+ "LEFTSHIFT",
+ "RIGHTSHIFT",
+ "DOUBLESTAR",
+ "PLUSEQUAL",
+ "MINEQUAL",
+ "STAREQUAL",
+ "SLASHEQUAL",
+ "PERCENTEQUAL",
+ "AMPEREQUAL",
+ "VBAREQUAL",
+ "CIRCUMFLEXEQUAL",
+ "LEFTSHIFTEQUAL",
+ "RIGHTSHIFTEQUAL",
+ "DOUBLESTAREQUAL",
+ "DOUBLESLASH",
+ "DOUBLESLASHEQUAL",
+ "AT",
+ "ATEQUAL",
+ "RARROW",
+ "ELLIPSIS",
+ "OP",
+    "<ERRORTOKEN>",
+    "<COMMENT>",
+    "<NL>",
+    "<ENCODING>",
+    "",
+};
+
+/* Return the token corresponding to a single character */
+
+int
+PyToken_OneChar(int c1)
+{
+ switch (c1) {
+ case '%': return PERCENT;
+ case '&': return AMPER;
+ case '(': return LPAR;
+ case ')': return RPAR;
+ case '*': return STAR;
+ case '+': return PLUS;
+ case ',': return COMMA;
+ case '-': return MINUS;
+ case '.': return DOT;
+ case '/': return SLASH;
+ case ':': return COLON;
+ case ';': return SEMI;
+ case '<': return LESS;
+ case '=': return EQUAL;
+ case '>': return GREATER;
+ case '@': return AT;
+ case '[': return LSQB;
+ case ']': return RSQB;
+ case '^': return CIRCUMFLEX;
+ case '{': return LBRACE;
+ case '|': return VBAR;
+ case '}': return RBRACE;
+ case '~': return TILDE;
+ }
+ return OP;
+}
+
+int
+PyToken_TwoChars(int c1, int c2)
+{
+ switch (c1) {
+ case '!':
+ switch (c2) {
+ case '=': return NOTEQUAL;
+ }
+ break;
+ case '%':
+ switch (c2) {
+ case '=': return PERCENTEQUAL;
+ }
+ break;
+ case '&':
+ switch (c2) {
+ case '=': return AMPEREQUAL;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*': return DOUBLESTAR;
+ case '=': return STAREQUAL;
+ }
+ break;
+ case '+':
+ switch (c2) {
+ case '=': return PLUSEQUAL;
+ }
+ break;
+ case '-':
+ switch (c2) {
+ case '=': return MINEQUAL;
+ case '>': return RARROW;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/': return DOUBLESLASH;
+ case '=': return SLASHEQUAL;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '<': return LEFTSHIFT;
+ case '=': return LESSEQUAL;
+ case '>': return NOTEQUAL;
+ }
+ break;
+ case '=':
+ switch (c2) {
+ case '=': return EQEQUAL;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '=': return GREATEREQUAL;
+ case '>': return RIGHTSHIFT;
+ }
+ break;
+ case '@':
+ switch (c2) {
+ case '=': return ATEQUAL;
+ }
+ break;
+ case '^':
+ switch (c2) {
+ case '=': return CIRCUMFLEXEQUAL;
+ }
+ break;
+ case '|':
+ switch (c2) {
+ case '=': return VBAREQUAL;
+ }
+ break;
+ }
+ return OP;
+}
+
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
+{
+ switch (c1) {
+ case '*':
+ switch (c2) {
+ case '*':
+ switch (c3) {
+ case '=': return DOUBLESTAREQUAL;
+ }
+ break;
+ }
+ break;
+ case '.':
+ switch (c2) {
+ case '.':
+ switch (c3) {
+ case '.': return ELLIPSIS;
+ }
+ break;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/':
+ switch (c3) {
+ case '=': return DOUBLESLASHEQUAL;
+ }
+ break;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '<':
+ switch (c3) {
+ case '=': return LEFTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '>':
+ switch (c3) {
+ case '=': return RIGHTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ }
+ return OP;
+}
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index fc75bae5376609..126c709b636f9c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
-/* Token names */
-
-const char *_PyParser_TokenNames[] = {
- "ENDMARKER",
- "NAME",
- "NUMBER",
- "STRING",
- "NEWLINE",
- "INDENT",
- "DEDENT",
- "LPAR",
- "RPAR",
- "LSQB",
- "RSQB",
- "COLON",
- "COMMA",
- "SEMI",
- "PLUS",
- "MINUS",
- "STAR",
- "SLASH",
- "VBAR",
- "AMPER",
- "LESS",
- "GREATER",
- "EQUAL",
- "DOT",
- "PERCENT",
- "LBRACE",
- "RBRACE",
- "EQEQUAL",
- "NOTEQUAL",
- "LESSEQUAL",
- "GREATEREQUAL",
- "TILDE",
- "CIRCUMFLEX",
- "LEFTSHIFT",
- "RIGHTSHIFT",
- "DOUBLESTAR",
- "PLUSEQUAL",
- "MINEQUAL",
- "STAREQUAL",
- "SLASHEQUAL",
- "PERCENTEQUAL",
- "AMPEREQUAL",
- "VBAREQUAL",
- "CIRCUMFLEXEQUAL",
- "LEFTSHIFTEQUAL",
- "RIGHTSHIFTEQUAL",
- "DOUBLESTAREQUAL",
- "DOUBLESLASH",
- "DOUBLESLASHEQUAL",
- "AT",
- "ATEQUAL",
- "RARROW",
- "ELLIPSIS",
- /* This table must match the #defines in token.h! */
- "OP",
- "",
- "COMMENT",
- "NL",
- "ENCODING",
- ""
-};
-
-
/* Create and initialize a new tok_state structure */
static struct tok_state *
@@ -1109,177 +1043,6 @@ tok_backup(struct tok_state *tok, int c)
}
-/* Return the token corresponding to a single character */
-
-int
-PyToken_OneChar(int c)
-{
- switch (c) {
- case '(': return LPAR;
- case ')': return RPAR;
- case '[': return LSQB;
- case ']': return RSQB;
- case ':': return COLON;
- case ',': return COMMA;
- case ';': return SEMI;
- case '+': return PLUS;
- case '-': return MINUS;
- case '*': return STAR;
- case '/': return SLASH;
- case '|': return VBAR;
- case '&': return AMPER;
- case '<': return LESS;
- case '>': return GREATER;
- case '=': return EQUAL;
- case '.': return DOT;
- case '%': return PERCENT;
- case '{': return LBRACE;
- case '}': return RBRACE;
- case '^': return CIRCUMFLEX;
- case '~': return TILDE;
- case '@': return AT;
- default: return OP;
- }
-}
-
-
-int
-PyToken_TwoChars(int c1, int c2)
-{
- switch (c1) {
- case '=':
- switch (c2) {
- case '=': return EQEQUAL;
- }
- break;
- case '!':
- switch (c2) {
- case '=': return NOTEQUAL;
- }
- break;
- case '<':
- switch (c2) {
- case '>': return NOTEQUAL;
- case '=': return LESSEQUAL;
- case '<': return LEFTSHIFT;
- }
- break;
- case '>':
- switch (c2) {
- case '=': return GREATEREQUAL;
- case '>': return RIGHTSHIFT;
- }
- break;
- case '+':
- switch (c2) {
- case '=': return PLUSEQUAL;
- }
- break;
- case '-':
- switch (c2) {
- case '=': return MINEQUAL;
- case '>': return RARROW;
- }
- break;
- case '*':
- switch (c2) {
- case '*': return DOUBLESTAR;
- case '=': return STAREQUAL;
- }
- break;
- case '/':
- switch (c2) {
- case '/': return DOUBLESLASH;
- case '=': return SLASHEQUAL;
- }
- break;
- case '|':
- switch (c2) {
- case '=': return VBAREQUAL;
- }
- break;
- case '%':
- switch (c2) {
- case '=': return PERCENTEQUAL;
- }
- break;
- case '&':
- switch (c2) {
- case '=': return AMPEREQUAL;
- }
- break;
- case '^':
- switch (c2) {
- case '=': return CIRCUMFLEXEQUAL;
- }
- break;
- case '@':
- switch (c2) {
- case '=': return ATEQUAL;
- }
- break;
- }
- return OP;
-}
-
-int
-PyToken_ThreeChars(int c1, int c2, int c3)
-{
- switch (c1) {
- case '<':
- switch (c2) {
- case '<':
- switch (c3) {
- case '=':
- return LEFTSHIFTEQUAL;
- }
- break;
- }
- break;
- case '>':
- switch (c2) {
- case '>':
- switch (c3) {
- case '=':
- return RIGHTSHIFTEQUAL;
- }
- break;
- }
- break;
- case '*':
- switch (c2) {
- case '*':
- switch (c3) {
- case '=':
- return DOUBLESTAREQUAL;
- }
- break;
- }
- break;
- case '/':
- switch (c2) {
- case '/':
- switch (c3) {
- case '=':
- return DOUBLESLASHEQUAL;
- }
- break;
- }
- break;
- case '.':
- switch (c2) {
- case '.':
- switch (c3) {
- case '.':
- return ELLIPSIS;
- }
- break;
- }
- break;
- }
- return OP;
-}
-
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
diff --git a/Tools/scripts/generate_symbol_py.py b/Tools/scripts/generate_symbol_py.py
new file mode 100755
index 00000000000000..9219b096e4d67d
--- /dev/null
+++ b/Tools/scripts/generate_symbol_py.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python3
+# This script generates the symbol.py source file.
+
+import sys
+import re
+
+def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
+ try:
+ fp = open(inFileName)
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(1)
+ with fp:
+ lines = fp.read().split("\n")
+ prog = re.compile(
+ "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+ re.IGNORECASE)
+ tokens = {}
+ for line in lines:
+ match = prog.match(line)
+ if match:
+ name, val = match.group(1, 2)
+ val = int(val)
+ tokens[val] = name # reverse so we can sort them...
+ keys = sorted(tokens.keys())
+ # load the output skeleton from the target:
+ try:
+ fp = open(outFileName)
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(2)
+ with fp:
+ format = fp.read().split("\n")
+ try:
+ start = format.index("#--start constants--") + 1
+ end = format.index("#--end constants--")
+ except ValueError:
+ sys.stderr.write("target does not contain format markers")
+ sys.exit(3)
+ lines = []
+ for val in keys:
+ lines.append("%s = %d" % (tokens[val], val))
+ format[start:end] = lines
+ try:
+ fp = open(outFileName, 'w')
+ except OSError as err:
+ sys.stderr.write("I/O error: %s\n" % str(err))
+ sys.exit(4)
+ with fp:
+ fp.write("\n".join(format))
+
+if __name__ == '__main__':
+ main(*sys.argv[1:])
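
generate_symbol_py.py treats the existing Lib/symbol.py as its own output skeleton: only the region between the "#--start constants--" and "#--end constants--" markers is replaced, and the rest of the file is kept verbatim. A minimal sketch of that splice on an in-memory skeleton (made-up constants, same technique):

    lines = ["# header kept as-is",
             "#--start constants--",
             "stale = 0",
             "#--end constants--",
             "# footer kept as-is"]
    start = lines.index("#--start constants--") + 1
    end = lines.index("#--end constants--")
    lines[start:end] = ["single_input = 256", "file_input = 257"]
    print("\n".join(lines))
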
diff --git a/Tools/scripts/generate_token.py b/Tools/scripts/generate_token.py
new file mode 100644
index 00000000000000..2ac931771f112a
--- /dev/null
+++ b/Tools/scripts/generate_token.py
@@ -0,0 +1,205 @@
+#! /usr/bin/env python3
+# This script generates token related files
+# from Lib/token.py:
+#
+# Doc/library/token-list.inc
+# Include/token.h
+# Parser/token.c
+
+
+def load_module(path):
+ module = type('Namespace', (), {})()
+ with open(path, 'rb') as fp:
+ code = fp.read()
+ exec(code, module.__dict__)
+ return module
+
+def load_tokens(path):
+ global NT_OFFSET
+ token = load_module(path)
+ tok_names = [token.tok_name[i] for i in range(token.N_TOKENS)]
+ NT_OFFSET = token.NT_OFFSET
+ ERRORTOKEN = token.ERRORTOKEN
+ string_to_tok = dict(token.EXACT_TOKEN_TYPES)
+ return tok_names, ERRORTOKEN, string_to_tok
+
+
+def update_file(file, content):
+ try:
+ with open(file, 'r') as fobj:
+ if fobj.read() == content:
+ return False
+    except FileNotFoundError:
+        pass
+ with open(file, 'w') as fobj:
+ fobj.write(content)
+ return True
+
+
+token_h_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+/* Token types */
+#ifndef Py_LIMITED_API
+#ifndef Py_TOKEN_H
+#define Py_TOKEN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
+
+%s\
+#define N_TOKENS %d
+#define NT_OFFSET %d
+
+/* Special definitions for cooperation with parser */
+
+#define ISTERMINAL(x) ((x) < NT_OFFSET)
+#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
+#define ISEOF(x) ((x) == ENDMARKER)
+
+
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
+PyAPI_FUNC(int) PyToken_OneChar(int);
+PyAPI_FUNC(int) PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TOKEN_H */
+#endif /* Py_LIMITED_API */
+"""
+
+def make_h(infile, outfile='Include/token.h'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+
+ defines = []
+ for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+ defines.append("#define %-15s %d\n" % (name, value))
+
+ if update_file(outfile, token_h_template % (
+ ''.join(defines),
+ len(tok_names),
+ NT_OFFSET
+ )):
+ print("%s regenerated from %s" % (outfile, infile), file=sys.stderr)
+
+
+token_c_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+#include "Python.h"
+#include "token.h"
+
+/* Token names */
+
+const char * const _PyParser_TokenNames[] = {
+%s\
+};
+
+/* Return the token corresponding to a single character */
+
+int
+PyToken_OneChar(int c1)
+{
+%s\
+ return OP;
+}
+
+int
+PyToken_TwoChars(int c1, int c2)
+{
+%s\
+ return OP;
+}
+
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
+{
+%s\
+ return OP;
+}
+"""
+
+def generate_chars_to_token(mapping, n=1):
+ result = []
+ write = result.append
+    indent = '    ' * n
+ write(indent)
+ write('switch (c%d) {\n' % (n,))
+ for c in sorted(mapping):
+ write(indent)
+ value = mapping[c]
+ if isinstance(value, dict):
+ write("case '%s':\n" % (c,))
+ write(generate_chars_to_token(value, n + 1))
+ write(indent)
+ write(' break;\n')
+ else:
+ write("case '%s': return %s;\n" % (c, value))
+ write(indent)
+ write('}\n')
+ return ''.join(result)
+
+def make_c(infile, outfile='Parser/token.c'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+ string_to_tok['<>'] = string_to_tok['!=']
+ chars_to_token = {}
+ for string, value in string_to_tok.items():
+ assert 1 <= len(string) <= 3
+ name = tok_names[value]
+ m = chars_to_token.setdefault(len(string), {})
+ for c in string[:-1]:
+ m = m.setdefault(c, {})
+ m[string[-1]] = name
+
+ names = []
+ for value, name in enumerate(tok_names):
+ if value >= ERRORTOKEN:
+ name = '<%s>' % name
+ names.append(' "%s",\n' % name)
+ names.append(' "",\n')
+
+ if update_file(outfile, token_c_template % (
+ ''.join(names),
+ generate_chars_to_token(chars_to_token[1]),
+ generate_chars_to_token(chars_to_token[2]),
+ generate_chars_to_token(chars_to_token[3])
+ )):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+token_inc_template = """\
+.. Auto-generated by Tools/scripts/generate_token.py
+%s
+.. data:: N_TOKENS
+
+.. data:: NT_OFFSET
+"""
+
+def make_rst(infile, outfile='Doc/library/token-list.inc'):
+ tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+ tok_to_string = {value: s for s, value in string_to_tok.items()}
+
+ names = []
+ for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+ names.append('.. data:: %s' % (name,))
+ if value in tok_to_string:
+ names.append('')
+ names.append(' Token value for ``"%s"``.' % tok_to_string[value])
+ names.append('')
+
+ if update_file(outfile, token_inc_template % '\n'.join(names)):
+ print("%s regenerated from %s" % (outfile, infile))
+
+
+def main(op, infile='Lib/token.py', *args):
+ make = globals()['make_' + op]
+ make(infile, *args)
+
+
+if __name__ == '__main__':
+ import sys
+ main(*sys.argv[1:])
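
make_c() above builds, per operator length, a small character trie out of EXACT_TOKEN_TYPES (plus the legacy '<>' spelling of '!='), and generate_chars_to_token() renders each trie as the nested switch statements seen in Parser/token.c. A sketch of the intermediate structure for a three-operator subset, mapping straight to token names here for brevity where the real code goes through tok_names[value]:

    string_to_tok = {'*': 'STAR', '**': 'DOUBLESTAR', '**=': 'DOUBLESTAREQUAL'}

    chars_to_token = {}
    for string, name in string_to_tok.items():
        trie = chars_to_token.setdefault(len(string), {})
        for c in string[:-1]:
            trie = trie.setdefault(c, {})
        trie[string[-1]] = name

    assert chars_to_token == {
        1: {'*': 'STAR'},
        2: {'*': {'*': 'DOUBLESTAR'}},
        3: {'*': {'*': {'=': 'DOUBLESTAREQUAL'}}},
    }
    # chars_to_token[3] is what generate_chars_to_token() turns into the
    # body of PyToken_ThreeChars().
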