diff --git a/CHANGES.md b/CHANGES.md index af756c79621..c536bff107e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,8 @@ +- Add support for the new Python 3.12 f-string syntax introduced by PEP 701 (#3822) + ### Stable style diff --git a/src/black/__init__.py b/src/black/__init__.py index 6f0e128f56c..6ba49d5ef2d 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -69,13 +69,7 @@ from black.mode import FUTURE_FLAG_TO_FEATURE, VERSION_TO_FEATURES, Feature from black.mode import Mode as Mode # re-exported from black.mode import Preview, TargetVersion, supports_feature -from black.nodes import ( - STARS, - is_number_token, - is_simple_decorator_expression, - is_string_token, - syms, -) +from black.nodes import STARS, is_number_token, is_simple_decorator_expression, syms from black.output import color_diff, diff, dump_to_file, err, ipynb_diff, out from black.parsing import ( # noqa F401 ASTSafetyError, @@ -91,7 +85,6 @@ sanitized_lines, ) from black.report import Changed, NothingChanged, Report -from black.trans import iter_fexpr_spans from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -1265,7 +1258,10 @@ def _format_str_once( elt = EmptyLineTracker(mode=mode) split_line_features = { feature - for feature in {Feature.TRAILING_COMMA_IN_CALL, Feature.TRAILING_COMMA_IN_DEF} + for feature in { + Feature.TRAILING_COMMA_IN_CALL, + Feature.TRAILING_COMMA_IN_DEF, + } if supports_feature(versions, feature) } block: Optional[LinesBlock] = None @@ -1337,15 +1333,14 @@ def get_features_used( # noqa: C901 } for n in node.pre_order(): - if is_string_token(n): - value_head = n.value[:2] - if value_head in {'f"', 'F"', "f'", "F'", "rf", "fr", "RF", "FR"}: - features.add(Feature.F_STRINGS) - if Feature.DEBUG_F_STRINGS not in features: - for span_beg, span_end in iter_fexpr_spans(n.value): - if n.value[span_beg : span_end - 1].rstrip().endswith("="): - features.add(Feature.DEBUG_F_STRINGS) - break + if n.type == token.FSTRING_START: + features.add(Feature.F_STRINGS) + elif ( + n.type == token.RBRACE + and n.parent is not None + and any(child.type == token.EQUAL for child in n.parent.children) + ): + features.add(Feature.DEBUG_F_STRINGS) elif is_number_token(n): if "_" in n.value: diff --git a/src/black/linegen.py b/src/black/linegen.py index 0f545172795..4b29a049dba 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -502,6 +502,45 @@ def visit_NUMBER(self, leaf: Leaf) -> Iterator[Line]: normalize_numeric_literal(leaf) yield from self.visit_default(leaf) + def visit_fstring(self, node: Node) -> Iterator[Line]: + # currently we don't want to format and split f-strings at all. + string_leaf = _fstring_to_string(node) + node.replace(string_leaf) + yield from self.visit_STRING(string_leaf) + + # TODO: Uncomment Implementation to format f-string children + # fstring_start = node.children[0] + # fstring_end = node.children[-1] + # assert isinstance(fstring_start, Leaf) + # assert isinstance(fstring_end, Leaf) + + # quote_char = fstring_end.value[0] + # quote_idx = fstring_start.value.index(quote_char) + # prefix, quote = ( + # fstring_start.value[:quote_idx], + # fstring_start.value[quote_idx:] + # ) + + # if not is_docstring(node, self.mode): + # prefix = normalize_string_prefix(prefix) + + # assert quote == fstring_end.value + + # is_raw_fstring = "r" in prefix or "R" in prefix + # middles = [ + # leaf + # for leaf in node.leaves() + # if leaf.type == token.FSTRING_MIDDLE + # ] + + # if self.mode.string_normalization: + # middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) + + # fstring_start.value = prefix + quote + # fstring_end.value = quote + + # yield from self.visit_default(node) + def __post_init__(self) -> None: """You are in a twisty little maze of passages.""" self.current_line = Line(mode=self.mode) @@ -535,6 +574,12 @@ def __post_init__(self) -> None: self.visit_guard = partial(v, keywords=Ø, parens={"if"}) +def _fstring_to_string(node: Node) -> Leaf: + """Converts an fstring node back to a string node.""" + string_without_prefix = str(node)[len(node.prefix) :] + return Leaf(token.STRING, string_without_prefix, prefix=node.prefix) + + def _hugging_power_ops_line_to_string( line: Line, features: Collection[Feature], diff --git a/src/black/lines.py b/src/black/lines.py index 6b65372fb3f..b9b3add9e2a 100644 --- a/src/black/lines.py +++ b/src/black/lines.py @@ -72,7 +72,12 @@ def append( Inline comments are put aside. """ - has_value = leaf.type in BRACKETS or bool(leaf.value.strip()) + has_value = ( + leaf.type in BRACKETS + # empty fstring-middles must not be truncated + or leaf.type == token.FSTRING_MIDDLE + or bool(leaf.value.strip()) + ) if not has_value: return diff --git a/src/black/mode.py b/src/black/mode.py index b54f355e20a..e3b2d5dbadd 100644 --- a/src/black/mode.py +++ b/src/black/mode.py @@ -46,6 +46,7 @@ class Feature(Enum): DEBUG_F_STRINGS = 16 PARENTHESIZED_CONTEXT_MANAGERS = 17 TYPE_PARAMS = 18 + FSTRING_PARSING = 19 FORCE_OPTIONAL_PARENTHESES = 50 # __future__ flags @@ -156,6 +157,7 @@ class Feature(Enum): Feature.EXCEPT_STAR, Feature.VARIADIC_GENERICS, Feature.TYPE_PARAMS, + Feature.FSTRING_PARSING, }, } diff --git a/src/black/nodes.py b/src/black/nodes.py index c0dca6e5783..f75e0848663 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -145,7 +145,13 @@ OPENING_BRACKETS: Final = set(BRACKET.keys()) CLOSING_BRACKETS: Final = set(BRACKET.values()) BRACKETS: Final = OPENING_BRACKETS | CLOSING_BRACKETS -ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | {token.COMMA, STANDALONE_COMMENT} +ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | { + token.COMMA, + STANDALONE_COMMENT, + token.FSTRING_MIDDLE, + token.FSTRING_END, + token.BANG, +} RARROW = 55 @@ -211,6 +217,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str: # no }: return NO + if t == token.LBRACE and p.type == syms.fstring_replacement_field: + return NO + prev = leaf.prev_sibling if not prev: prevp = preceding_leaf(p) @@ -272,6 +281,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str: # no elif prev.type in OPENING_BRACKETS: return NO + elif prev.type == token.BANG: + return NO + if p.type in {syms.parameters, syms.arglist}: # untyped function signatures or calls if not prev or prev.type != token.COMMA: @@ -393,6 +405,7 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str: # no elif prevp.type == token.EQUAL and prevp_parent.type == syms.argument: return NO + # TODO: add fstring here? elif t in {token.NAME, token.NUMBER, token.STRING}: return NO @@ -542,31 +555,32 @@ def is_arith_like(node: LN) -> bool: } -def is_docstring(leaf: Leaf, mode: Mode) -> bool: - if leaf.type != token.STRING: - return False +def is_docstring(node: NL, mode: Mode) -> bool: + if isinstance(node, Leaf): + if node.type != token.STRING: + return False - prefix = get_string_prefix(leaf.value) - if set(prefix).intersection("bBfF"): - return False + prefix = get_string_prefix(node.value) + if set(prefix).intersection("bBfF"): + return False if ( Preview.unify_docstring_detection in mode - and leaf.parent - and leaf.parent.type == syms.simple_stmt - and not leaf.parent.prev_sibling - and leaf.parent.parent - and leaf.parent.parent.type == syms.file_input + and node.parent + and node.parent.type == syms.simple_stmt + and not node.parent.prev_sibling + and node.parent.parent + and node.parent.parent.type == syms.file_input ): return True if prev_siblings_are( - leaf.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt] + node.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt] ): return True # Multiline docstring on the same line as the `def`. - if prev_siblings_are(leaf.parent, [syms.parameters, token.COLON, syms.simple_stmt]): + if prev_siblings_are(node.parent, [syms.parameters, token.COLON, syms.simple_stmt]): # `syms.parameters` is only used in funcdefs and async_funcdefs in the Python # grammar. We're safe to return True without further checks. return True @@ -954,10 +968,6 @@ def is_rpar_token(nl: NL) -> TypeGuard[Leaf]: return nl.type == token.RPAR -def is_string_token(nl: NL) -> TypeGuard[Leaf]: - return nl.type == token.STRING - - def is_number_token(nl: NL) -> TypeGuard[Leaf]: return nl.type == token.NUMBER diff --git a/src/black/strings.py b/src/black/strings.py index baa88162844..69a8c8002e9 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -5,7 +5,7 @@ import re import sys from functools import lru_cache -from typing import Final, List, Match, Pattern +from typing import Final, List, Match, Pattern, Tuple from black._width_table import WIDTH_TABLE from blib2to3.pytree import Leaf @@ -169,8 +169,7 @@ def _cached_compile(pattern: str) -> Pattern[str]: def normalize_string_quotes(s: str) -> str: """Prefer double quotes but only if it doesn't cause more escaping. - Adds or removes backslashes as appropriate. Doesn't parse and fix - strings nested in f-strings. + Adds or removes backslashes as appropriate. """ value = s.lstrip(STRING_PREFIX_CHARS) if value[:3] == '"""': @@ -211,6 +210,7 @@ def normalize_string_quotes(s: str) -> str: s = f"{prefix}{orig_quote}{body}{orig_quote}" new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body) new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body) + if "f" in prefix.casefold(): matches = re.findall( r""" @@ -240,6 +240,71 @@ def normalize_string_quotes(s: str) -> str: return f"{prefix}{new_quote}{new_body}{new_quote}" +def normalize_fstring_quotes( + quote: str, + middles: List[Leaf], + is_raw_fstring: bool, +) -> Tuple[List[Leaf], str]: + """Prefer double quotes but only if it doesn't cause more escaping. + + Adds or removes backslashes as appropriate. + """ + if quote == '"""': + return middles, quote + + elif quote == "'''": + new_quote = '"""' + elif quote == '"': + new_quote = "'" + else: + new_quote = '"' + + unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}") + escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}") + escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}") + if is_raw_fstring: + for middle in middles: + if unescaped_new_quote.search(middle.value): + # There's at least one unescaped new_quote in this raw string + # so converting is impossible + return middles, quote + + # Do not introduce or remove backslashes in raw strings, just use double quote + return middles, '"' + + new_segments = [] + for middle in middles: + segment = middle.value + # remove unnecessary escapes + new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment) + if segment != new_segment: + # Consider the string without unnecessary escapes as the original + middle.value = new_segment + + new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment) + new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment) + new_segments.append(new_segment) + + if new_quote == '"""' and new_segments[-1].endswith('"'): + # edge case: + new_segments[-1] = new_segments[-1][:-1] + '\\"' + + for middle, new_segment in zip(middles, new_segments): + orig_escape_count = middle.value.count("\\") + new_escape_count = new_segment.count("\\") + + if new_escape_count > orig_escape_count: + return middles, quote # Do not introduce more escaping + + if new_escape_count == orig_escape_count and quote == '"': + return middles, quote # Prefer double quotes + + for middle, new_segment in zip(middles, new_segments): + middle.value = new_segment + + return middles, new_quote + + def normalize_unicode_escape_sequences(leaf: Leaf) -> None: """Replace hex codes in Unicode escape sequences with lowercase representation.""" text = leaf.value diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 5db78723cec..0c8ac99daba 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -163,7 +163,7 @@ atom: ('(' [yield_expr|testlist_gexp] ')' | '[' [listmaker] ']' | '{' [dictsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+ | '.' '.' '.') + NAME | NUMBER | (STRING | fstring)+ | '.' '.' '.') listmaker: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) testlist_gexp: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) lambdef: 'lambda' [varargslist] ':' test @@ -254,3 +254,8 @@ case_block: "case" patterns [guard] ':' suite guard: 'if' namedexpr_test patterns: pattern (',' pattern)* [','] pattern: (expr|star_expr) ['as' expr] + +fstring: FSTRING_START fstring_middle* FSTRING_END +fstring_middle: fstring_replacement_field | FSTRING_MIDDLE +fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" NAME ] [ ':' fstring_format_spec* ] '}' +fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index be3984437a8..71a147cbcd8 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -167,7 +167,9 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> if type in {token.INDENT, token.DEDENT}: prefix = _prefix lineno, column = end - if value.endswith("\n"): + # FSTRING_MIDDLE is the only token that can end with a newline, and + # `end` will point to the next line. For that case, don't increment lineno. + if value.endswith("\n") and type != token.FSTRING_MIDDLE: lineno += 1 column = 0 else: diff --git a/src/blib2to3/pgen2/grammar.py b/src/blib2to3/pgen2/grammar.py index 1f3fdc55b97..804db1ad985 100644 --- a/src/blib2to3/pgen2/grammar.py +++ b/src/blib2to3/pgen2/grammar.py @@ -218,6 +218,7 @@ def report(self) -> None: //= DOUBLESLASHEQUAL -> RARROW := COLONEQUAL +! BANG """ opmap = {} diff --git a/src/blib2to3/pgen2/token.py b/src/blib2to3/pgen2/token.py index ed2fc4e85fc..3068c3157fc 100644 --- a/src/blib2to3/pgen2/token.py +++ b/src/blib2to3/pgen2/token.py @@ -66,7 +66,11 @@ ASYNC: Final = 57 ERRORTOKEN: Final = 58 COLONEQUAL: Final = 59 -N_TOKENS: Final = 60 +FSTRING_START: Final = 60 +FSTRING_MIDDLE: Final = 61 +FSTRING_END: Final = 62 +BANG: Final = 63 +N_TOKENS: Final = 64 NT_OFFSET: Final = 256 # --end constants-- diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index b04b18ba870..d6b684ab1aa 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,6 +27,7 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" +import builtins import sys from typing import ( Callable, @@ -49,12 +50,17 @@ DEDENT, ENDMARKER, ERRORTOKEN, + FSTRING_END, + FSTRING_MIDDLE, + FSTRING_START, INDENT, + LBRACE, NAME, NEWLINE, NL, NUMBER, OP, + RBRACE, STRING, tok_name, ) @@ -120,14 +126,32 @@ def _combinations(*l: str) -> Set[str]: Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" # Tail end of """ string. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?" -Triple = group(_litprefix + "'''", _litprefix + '"""') -# Single-line ' or " string. -String = group( - _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"', +_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?" +_fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)" +Triple = group( + _litprefix + "'''", + _litprefix + '"""', + _fstringlitprefix + '"""', + _fstringlitprefix + "'''", ) +# beginning of a single quoted f-string. must not end with `{{` or `\N{` +SingleLbrace = r"[^'\\{]*(?:(?:\\N{|\\.|{{)[^'\\{]*)*(? Set[str]: Special = group(r"\r?\n", r"[:;.,`@]") Funny = group(Operator, Bracket, Special) +_string_middle_single = r"[^\n'\\]*(?:\\.[^\n'\\]*)*" +_string_middle_double = r'[^\n"\\]*(?:\\.[^\n"\\]*)*' + +# FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{` +_fstring_middle_single = r"[^\n'{]*(?:(?:\\N{|\\[^{]|{{)[^\n'{]*)*(? str: return ut.untokenize(iterable) +def is_fstring_start(token: str) -> bool: + return builtins.any(token.startswith(prefix) for prefix in fstring_prefix) + + +def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]: + for prefix in fstring_prefix: + _, prefix, rest = token.partition(prefix) + if prefix != "": + return prefix, rest + + raise ValueError(f"Token {token!r} is not a valid f-string start") + + def generate_tokens( readline: Callable[[], str], grammar: Optional[Grammar] = None ) -> Iterator[GoodTokenInfo]: @@ -433,7 +498,12 @@ def generate_tokens( and the line on which the token was found. The line passed is the logical line; continuation lines are included. """ - lnum = parenlev = continued = 0 + lnum = parenlev = fstring_level = continued = 0 + parenlev_stack: List[int] = [] + inside_fstring_braces = False + inside_fstring_colon = False + formatspec = "" + bracelev = 0 numchars: Final[str] = "0123456789" contstr, needcont = "", 0 contline: Optional[str] = None @@ -449,7 +519,8 @@ def generate_tokens( async_def_nl = False strstart: Tuple[int, int] - endprog: Pattern[str] + endprog_stack: List[Pattern[str]] = [] + formatspec_start: Tuple[int, int] while 1: # loop over lines in stream try: @@ -463,16 +534,72 @@ def generate_tokens( assert contline is not None if not line: raise TokenError("EOF in multi-line string", strstart) + endprog = endprog_stack[-1] endmatch = endprog.match(line) if endmatch: - pos = end = endmatch.end(0) - yield ( - STRING, - contstr + line[:end], - strstart, - (lnum, end), - contline + line, - ) + end = endmatch.end(0) + token = contstr + line[:end] + spos = strstart + epos = (lnum, end) + tokenline = contline + line + if fstring_level == 0 and not is_fstring_start(token): + yield (STRING, token, spos, epos, tokenline) + endprog_stack.pop() + parenlev = parenlev_stack.pop() + else: + if is_fstring_start(token): + fstring_level += 1 + fstring_start, token = _split_fstring_start_and_middle(token) + fstring_start_epos = (lnum, spos[1] + len(fstring_start)) + yield ( + FSTRING_START, + fstring_start, + spos, + fstring_start_epos, + tokenline, + ) + # increase spos to the end of the fstring start + spos = fstring_start_epos + + if token.endswith("{"): + fstring_middle, lbrace = token[:-1], token[-1] + fstring_middle_epos = lbrace_spos = (lnum, end - 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield (LBRACE, lbrace, lbrace_spos, epos, line) + inside_fstring_braces = True + else: + if token.endswith(('"""', "'''")): + fstring_middle, fstring_end = token[:-3], token[-3:] + fstring_middle_epos = end_spos = (lnum, end - 3) + else: + fstring_middle, fstring_end = token[:-1], token[-1] + fstring_middle_epos = end_spos = (lnum, end - 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield ( + FSTRING_END, + fstring_end, + end_spos, + epos, + line, + ) + fstring_level -= 1 + endprog_stack.pop() + parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True + pos = end contstr, needcont = "", 0 contline = None elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n": @@ -491,7 +618,8 @@ def generate_tokens( contline = contline + line continue - elif parenlev == 0 and not continued: # new statement + # new statement + elif parenlev == 0 and not continued and not inside_fstring_braces: if not line: break column = 0 @@ -559,6 +687,98 @@ def generate_tokens( continued = 0 while pos < max: + if fstring_level > 0 and not inside_fstring_braces: + endprog = endprog_stack[-1] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + start, end = endmatch.span(0) + token = line[start:end] + if token.endswith(('"""', "'''")): + middle_token, end_token = token[:-3], token[-3:] + middle_epos = end_spos = (lnum, end - 3) + else: + middle_token, end_token = token[:-1], token[-1] + middle_epos = end_spos = (lnum, end - 1) + # TODO: unsure if this can be safely removed + if stashed: + yield stashed + stashed = None + yield ( + FSTRING_MIDDLE, + middle_token, + (lnum, pos), + middle_epos, + line, + ) + if not token.endswith("{"): + yield ( + FSTRING_END, + end_token, + end_spos, + (lnum, end), + line, + ) + fstring_level -= 1 + endprog_stack.pop() + parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True + else: + yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line) + inside_fstring_braces = True + pos = end + continue + else: # multiple lines + strstart = (lnum, end) + contstr = line[end:] + contline = line + break + + if inside_fstring_colon: + match = fstring_middle_after_colon.match(line, pos) + if match is None: + formatspec += line[pos:] + pos = max + continue + + start, end = match.span(1) + token = line[start:end] + formatspec += token + + brace_start, brace_end = match.span(2) + brace_or_nl = line[brace_start:brace_end] + if brace_or_nl == "\n": + pos = brace_end + + yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) + formatspec = "" + + if brace_or_nl == "{": + yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line) + bracelev += 1 + end = brace_end + + inside_fstring_colon = False + pos = end + continue + + if inside_fstring_braces and parenlev == 0: + match = bang.match(line, pos) + if match: + start, end = match.span(1) + yield (OP, "!", (lnum, start), (lnum, end), line) + pos = end + continue + + match = colon.match(line, pos) + if match: + start, end = match.span(1) + yield (OP, ":", (lnum, start), (lnum, end), line) + inside_fstring_colon = True + formatspec_start = (lnum, end) + pos = end + continue + pseudomatch = pseudoprog.match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) @@ -571,7 +791,7 @@ def generate_tokens( yield (NUMBER, token, spos, epos, line) elif initial in "\r\n": newline = NEWLINE - if parenlev > 0: + if parenlev > 0 or inside_fstring_braces: newline = NL elif async_def: async_def_nl = True @@ -588,17 +808,72 @@ def generate_tokens( yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: endprog = endprogs[token] + endprog_stack.append(endprog) + parenlev_stack.append(parenlev) + parenlev = 0 + if is_fstring_start(token): + yield (FSTRING_START, token, spos, epos, line) + fstring_level += 1 + endmatch = endprog.match(line, pos) if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] if stashed: yield stashed stashed = None - yield (STRING, token, spos, (lnum, pos), line) + if not is_fstring_start(token): + pos = endmatch.end(0) + token = line[start:pos] + epos = (lnum, pos) + yield (STRING, token, spos, epos, line) + endprog_stack.pop() + parenlev = parenlev_stack.pop() + else: + end = endmatch.end(0) + token = line[pos:end] + spos, epos = (lnum, pos), (lnum, end) + if not token.endswith("{"): + fstring_middle, fstring_end = token[:-3], token[-3:] + fstring_middle_epos = fstring_end_spos = (lnum, end - 3) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield ( + FSTRING_END, + fstring_end, + fstring_end_spos, + epos, + line, + ) + fstring_level -= 1 + endprog_stack.pop() + parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True + else: + fstring_middle, lbrace = token[:-1], token[-1] + fstring_middle_epos = lbrace_spos = (lnum, end - 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield (LBRACE, lbrace, lbrace_spos, epos, line) + inside_fstring_braces = True + pos = end else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] + # multiple lines + if is_fstring_start(token): + strstart = (lnum, pos) + contstr = line[pos:] + else: + strstart = (lnum, start) + contstr = line[start:] contline = line break elif ( @@ -606,17 +881,18 @@ def generate_tokens( or token[:2] in single_quoted or token[:3] in single_quoted ): + maybe_endprog = ( + endprogs.get(initial) + or endprogs.get(token[:2]) + or endprogs.get(token[:3]) + ) + assert maybe_endprog is not None, f"endprog not found for {token}" + endprog = maybe_endprog if token[-1] == "\n": # continued string + endprog_stack.append(endprog) + parenlev_stack.append(parenlev) + parenlev = 0 strstart = (lnum, start) - maybe_endprog = ( - endprogs.get(initial) - or endprogs.get(token[1]) - or endprogs.get(token[2]) - ) - assert ( - maybe_endprog is not None - ), f"endprog not found for {token}" - endprog = maybe_endprog contstr, needcont = line[start:], 1 contline = line break @@ -624,7 +900,57 @@ def generate_tokens( if stashed: yield stashed stashed = None - yield (STRING, token, spos, epos, line) + + if not is_fstring_start(token): + yield (STRING, token, spos, epos, line) + else: + if pseudomatch[20] is not None: + fstring_start = pseudomatch[20] + offset = pseudomatch.end(20) - pseudomatch.start(1) + elif pseudomatch[22] is not None: + fstring_start = pseudomatch[22] + offset = pseudomatch.end(22) - pseudomatch.start(1) + elif pseudomatch[24] is not None: + fstring_start = pseudomatch[24] + offset = pseudomatch.end(24) - pseudomatch.start(1) + else: + fstring_start = pseudomatch[26] + offset = pseudomatch.end(26) - pseudomatch.start(1) + + start_epos = (lnum, start + offset) + yield (FSTRING_START, fstring_start, spos, start_epos, line) + fstring_level += 1 + endprog = endprogs[fstring_start] + endprog_stack.append(endprog) + parenlev_stack.append(parenlev) + parenlev = 0 + + end_offset = pseudomatch.end(1) - 1 + fstring_middle = line[start + offset : end_offset] + middle_spos = (lnum, start + offset) + middle_epos = (lnum, end_offset) + yield ( + FSTRING_MIDDLE, + fstring_middle, + middle_spos, + middle_epos, + line, + ) + if not token.endswith("{"): + end_spos = (lnum, end_offset) + end_epos = (lnum, end_offset + 1) + yield (FSTRING_END, token[-1], end_spos, end_epos, line) + fstring_level -= 1 + endprog_stack.pop() + parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True + else: + end_spos = (lnum, end_offset) + end_epos = (lnum, end_offset + 1) + yield (LBRACE, "{", end_spos, end_epos, line) + inside_fstring_braces = True + elif initial.isidentifier(): # ordinary name if token in ("async", "await"): if async_keywords or async_def: @@ -669,8 +995,22 @@ def generate_tokens( stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 + elif ( + initial == "}" + and parenlev == 0 + and bracelev == 0 + and fstring_level > 0 + ): + yield (RBRACE, token, spos, epos, line) + inside_fstring_braces = False else: - if initial in "([{": + if parenlev == 0 and bracelev > 0 and initial == "}": + bracelev -= 1 + # if we're still inside fstrings, we're still part of the format spec + if inside_fstring_braces: + inside_fstring_colon = True + formatspec_start = (lnum, pos) + elif initial in "([{": parenlev += 1 elif initial in ")]}": parenlev -= 1 @@ -689,6 +1029,8 @@ def generate_tokens( for _indent in indents[1:]: # pop remaining indent levels yield (DEDENT, "", (lnum, 0), (lnum, 0), "") yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") + assert len(endprog_stack) == 0 + assert len(parenlev_stack) == 0 if __name__ == "__main__": # testing diff --git a/src/blib2to3/pygram.py b/src/blib2to3/pygram.py index 2b43b4c112b..70a5684bb07 100644 --- a/src/blib2to3/pygram.py +++ b/src/blib2to3/pygram.py @@ -70,6 +70,10 @@ class _python_symbols(Symbols): file_input: int flow_stmt: int for_stmt: int + fstring: int + fstring_format_spec: int + fstring_middle: int + fstring_replacement_field: int funcdef: int global_stmt: int guard: int diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py new file mode 100644 index 00000000000..c5bc48e95f2 --- /dev/null +++ b/tests/data/cases/pep_701.py @@ -0,0 +1,224 @@ +# flags: --minimum-version=3.12 +x = f"foo" +x = f'foo' +x = f"""foo""" +x = f'''foo''' +x = f"foo {{ bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f'foo {{ {2 + 2}bar {{ baz' +x = f"""foo {{ {2 + 2}bar {{ baz""" +x = f'''foo {{ {2 + 2}bar {{ baz''' + +# edge case: FSTRING_MIDDLE containing only whitespace should not be stripped +x = f"{a} {b}" + +x = f"foo { + 2 + 2 +} bar baz" + +x = f"foo {{ {"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f'a {2 + 2} b'}bar {{ baz" +x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" + +x = f"foo {{ {f'a {f"a {2 + 2} b"} b'}bar {{ baz" +x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" + +x = """foo {{ {2 + 2}bar +baz""" + + +x = f"""foo {{ {2 + 2}bar {{ baz""" + +x = f"""foo {{ { + 2 + 2 +}bar {{ baz""" + + +x = f"""foo {{ { + 2 + 2 +}bar +baz""" + +x = f"""foo {{ a + foo {2 + 2}bar {{ baz + + x = f"foo {{ { + 2 + 2 # comment + }bar" + + {{ baz + + }} buzz + + {print("abc" + "def" +)} +abc""" + +# edge case: end triple quotes at index zero +f"""foo {2+2} bar +""" + +f' \' {f"'"} \' ' +f" \" {f'"'} \" " + +x = f"a{2+2:=^72}b" +x = f"a{2+2:x}b" + +rf'foo' +rf'{foo}' + +f"{x:{y}d}" + +x = f"a{2+2:=^{x}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}one more}b" +f'{(abc:=10)}' + +f"This is a really long string, but just make sure that you reflow fstrings { + 2+2:d +}" +f"This is a really long string, but just make sure that you reflow fstrings correctly {2+2:d}" + +f"{2+2=}" +f"{2+2 = }" +f"{ 2 + 2 = }" + +f"""foo { + datetime.datetime.now():%Y +%m +%d +}""" + +f"{ +X +!r +}" + +raise ValueError( + "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" + f" {lines_str!r}" + ) + +f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ +got {escape}" + +x = f'\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}' +fr'\{{\}}' + +f""" + WITH {f''' + {1}_cte AS ()'''} +""" + +# output + +x = f"foo" +x = f"foo" +x = f"""foo""" +x = f"""foo""" +x = f"foo {{ bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f"""foo {{ {2 + 2}bar {{ baz""" +x = f"""foo {{ {2 + 2}bar {{ baz""" + +# edge case: FSTRING_MIDDLE containing only whitespace should not be stripped +x = f"{a} {b}" + +x = f"foo { + 2 + 2 +} bar baz" + +x = f"foo {{ {"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f'a {2 + 2} b'}bar {{ baz" +x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" + +x = f"foo {{ {f'a {f"a {2 + 2} b"} b'}bar {{ baz" +x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" + +x = """foo {{ {2 + 2}bar +baz""" + + +x = f"""foo {{ {2 + 2}bar {{ baz""" + +x = f"""foo {{ { + 2 + 2 +}bar {{ baz""" + + +x = f"""foo {{ { + 2 + 2 +}bar +baz""" + +x = f"""foo {{ a + foo {2 + 2}bar {{ baz + + x = f"foo {{ { + 2 + 2 # comment + }bar" + + {{ baz + + }} buzz + + {print("abc" + "def" +)} +abc""" + +# edge case: end triple quotes at index zero +f"""foo {2+2} bar +""" + +f' \' {f"'"} \' ' +f" \" {f'"'} \" " + +x = f"a{2+2:=^72}b" +x = f"a{2+2:x}b" + +rf"foo" +rf"{foo}" + +f"{x:{y}d}" + +x = f"a{2+2:=^{x}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}one more}b" +f"{(abc:=10)}" + +f"This is a really long string, but just make sure that you reflow fstrings { + 2+2:d +}" +f"This is a really long string, but just make sure that you reflow fstrings correctly {2+2:d}" + +f"{2+2=}" +f"{2+2 = }" +f"{ 2 + 2 = }" + +f"""foo { + datetime.datetime.now():%Y +%m +%d +}""" + +f"{ +X +!r +}" + +raise ValueError( + "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" + f" {lines_str!r}" +) + +f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ +got {escape}" + +x = f"\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}" +rf"\{{\}}" + +f""" + WITH {f''' + {1}_cte AS ()'''} +""" diff --git a/tests/data/miscellaneous/debug_visitor.out b/tests/data/miscellaneous/debug_visitor.out index fa60010d421..24d7ed82472 100644 --- a/tests/data/miscellaneous/debug_visitor.out +++ b/tests/data/miscellaneous/debug_visitor.out @@ -229,8 +229,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -370,8 +396,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}/{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '/' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -494,8 +546,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -557,8 +635,36 @@ file_input LPAR '(' arglist - STRING - "f' {node.prefix!r}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + ' ' + fstring_replacement_field + LBRACE + '{' + power + NAME + 'node' + trailer + DOT + '.' + NAME + 'prefix' + /trailer + /power + BANG + '!' + NAME + 'r' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -613,8 +719,36 @@ file_input LPAR '(' arglist - STRING - "f' {node.value!r}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + ' ' + fstring_replacement_field + LBRACE + '{' + power + NAME + 'node' + trailer + DOT + '.' + NAME + 'value' + /trailer + /power + BANG + '!' + NAME + 'r' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument diff --git a/tests/test_black.py b/tests/test_black.py index 2e3ae4503f5..498eb06fc39 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -343,12 +343,11 @@ def test_detect_debug_f_strings(self) -> None: features = black.get_features_used(root) self.assertNotIn(black.Feature.DEBUG_F_STRINGS, features) - # We don't yet support feature version detection in nested f-strings root = black.lib2to3_parse( """f"heard a rumour that { f'{1+1=}' } ... seems like it could be true" """ ) features = black.get_features_used(root) - self.assertNotIn(black.Feature.DEBUG_F_STRINGS, features) + self.assertIn(black.Feature.DEBUG_F_STRINGS, features) @patch("black.dump_to_file", dump_to_stderr) def test_string_quotes(self) -> None: