Add PEP 701 support (#3822)

Co-authored-by: Shantanu <12621235+hauntsaninja@users.noreply.github.com> Co-authored-by: hauntsaninja <hauntsaninja@gmail.com> Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
psf · Apr 22, 2024 · 551ede2 · 551ede2
1 parent 944b99a
commit 551ede2
Show file tree

Hide file tree

Showing 16 changed files with 941 additions and 102 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -6,6 +6,8 @@
 
 <!-- Include any especially major or disruptive changes here -->
 
+- Add support for the new Python 3.12 f-string syntax introduced by PEP 701 (#3822)
+
 ### Stable style
 
 <!-- Changes that affect Black's stable style -->

diff --git a/src/black/__init__.py b/src/black/__init__.py
@@ -69,13 +69,7 @@
 from black.mode import FUTURE_FLAG_TO_FEATURE, VERSION_TO_FEATURES, Feature
 from black.mode import Mode as Mode  # re-exported
 from black.mode import Preview, TargetVersion, supports_feature
-from black.nodes import (
-    STARS,
-    is_number_token,
-    is_simple_decorator_expression,
-    is_string_token,
-    syms,
-)
+from black.nodes import STARS, is_number_token, is_simple_decorator_expression, syms
 from black.output import color_diff, diff, dump_to_file, err, ipynb_diff, out
 from black.parsing import (  # noqa F401
     ASTSafetyError,
@@ -91,7 +85,6 @@
     sanitized_lines,
 )
 from black.report import Changed, NothingChanged, Report
-from black.trans import iter_fexpr_spans
 from blib2to3.pgen2 import token
 from blib2to3.pytree import Leaf, Node
 
@@ -1265,7 +1258,10 @@ def _format_str_once(
     elt = EmptyLineTracker(mode=mode)
     split_line_features = {
         feature
-        for feature in {Feature.TRAILING_COMMA_IN_CALL, Feature.TRAILING_COMMA_IN_DEF}
+        for feature in {
+            Feature.TRAILING_COMMA_IN_CALL,
+            Feature.TRAILING_COMMA_IN_DEF,
+        }
         if supports_feature(versions, feature)
     }
     block: Optional[LinesBlock] = None
@@ -1337,15 +1333,14 @@ def get_features_used(  # noqa: C901
         }
 
     for n in node.pre_order():
-        if is_string_token(n):
-            value_head = n.value[:2]
-            if value_head in {'f"', 'F"', "f'", "F'", "rf", "fr", "RF", "FR"}:
-                features.add(Feature.F_STRINGS)
-                if Feature.DEBUG_F_STRINGS not in features:
-                    for span_beg, span_end in iter_fexpr_spans(n.value):
-                        if n.value[span_beg : span_end - 1].rstrip().endswith("="):
-                            features.add(Feature.DEBUG_F_STRINGS)
-                            break
+        if n.type == token.FSTRING_START:
+            features.add(Feature.F_STRINGS)
+        elif (
+            n.type == token.RBRACE
+            and n.parent is not None
+            and any(child.type == token.EQUAL for child in n.parent.children)
+        ):
+            features.add(Feature.DEBUG_F_STRINGS)
 
         elif is_number_token(n):
             if "_" in n.value:

diff --git a/src/black/linegen.py b/src/black/linegen.py
@@ -502,6 +502,45 @@ def visit_NUMBER(self, leaf: Leaf) -> Iterator[Line]:
         normalize_numeric_literal(leaf)
         yield from self.visit_default(leaf)
 
+    def visit_fstring(self, node: Node) -> Iterator[Line]:
+        """Visit an `fstring` node by treating it as one plain STRING leaf.
+
+        The node is swapped in the tree for a single STRING leaf built by
+        `_fstring_to_string`, and that leaf is then handed to `visit_STRING`,
+        so the contents of the f-string are neither formatted nor split here.
+        """
+        # For now the f-string is deliberately handled as an opaque string.
+        flattened = _fstring_to_string(node)
+        node.replace(flattened)
+        yield from self.visit_STRING(flattened)
+
+        # TODO: Uncomment Implementation to format f-string children
+        # fstring_start = node.children[0]
+        # fstring_end = node.children[-1]
+        # assert isinstance(fstring_start, Leaf)
+        # assert isinstance(fstring_end, Leaf)
+
+        # quote_char = fstring_end.value[0]
+        # quote_idx = fstring_start.value.index(quote_char)
+        # prefix, quote = (
+        #     fstring_start.value[:quote_idx],
+        #     fstring_start.value[quote_idx:]
+        # )
+
+        # if not is_docstring(node, self.mode):
+        #     prefix = normalize_string_prefix(prefix)
+
+        # assert quote == fstring_end.value
+
+        # is_raw_fstring = "r" in prefix or "R" in prefix
+        # middles = [
+        #     leaf
+        #     for leaf in node.leaves()
+        #     if leaf.type == token.FSTRING_MIDDLE
+        # ]
+
+        # if self.mode.string_normalization:
+        #     middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring)
+
+        # fstring_start.value = prefix + quote
+        # fstring_end.value = quote
+
+        # yield from self.visit_default(node)
+
     def __post_init__(self) -> None:
         """You are in a twisty little maze of passages."""
         self.current_line = Line(mode=self.mode)
@@ -535,6 +574,12 @@ def __post_init__(self) -> None:
             self.visit_guard = partial(v, keywords=Ø, parens={"if"})
 
 
+def _fstring_to_string(node: Node) -> Leaf:
+    """Build a single STRING leaf holding the full source text of an fstring node.
+
+    The node's own prefix is cut off the front of its rendered text and
+    passed to the new leaf as its `prefix` instead.
+    """
+    leading = node.prefix
+    rendered = str(node)
+    # Drop exactly as many characters as the prefix is long.
+    body = rendered[len(leading) :]
+    return Leaf(token.STRING, body, prefix=leading)
+
+
 def _hugging_power_ops_line_to_string(
     line: Line,
     features: Collection[Feature],

diff --git a/src/black/lines.py b/src/black/lines.py
@@ -72,7 +72,12 @@ def append(
 
         Inline comments are put aside.
         """
-        has_value = leaf.type in BRACKETS or bool(leaf.value.strip())
+        has_value = (
+            leaf.type in BRACKETS
+            # empty fstring-middles must not be truncated
+            or leaf.type == token.FSTRING_MIDDLE
+            or bool(leaf.value.strip())
+        )
         if not has_value:
             return
 

diff --git a/src/black/mode.py b/src/black/mode.py
@@ -46,6 +46,7 @@ class Feature(Enum):
     DEBUG_F_STRINGS = 16
     PARENTHESIZED_CONTEXT_MANAGERS = 17
     TYPE_PARAMS = 18
+    FSTRING_PARSING = 19
     FORCE_OPTIONAL_PARENTHESES = 50
 
     # __future__ flags
@@ -156,6 +157,7 @@ class Feature(Enum):
         Feature.EXCEPT_STAR,
         Feature.VARIADIC_GENERICS,
         Feature.TYPE_PARAMS,
+        Feature.FSTRING_PARSING,
     },
 }
 

diff --git a/src/black/nodes.py b/src/black/nodes.py
@@ -145,7 +145,13 @@
 OPENING_BRACKETS: Final = set(BRACKET.keys())
 CLOSING_BRACKETS: Final = set(BRACKET.values())
 BRACKETS: Final = OPENING_BRACKETS | CLOSING_BRACKETS
-ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | {token.COMMA, STANDALONE_COMMENT}
+ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | {
+    token.COMMA,
+    STANDALONE_COMMENT,
+    token.FSTRING_MIDDLE,
+    token.FSTRING_END,
+    token.BANG,
+}
 
 RARROW = 55
 
@@ -211,6 +217,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str:  # no
     }:
         return NO
 
+    if t == token.LBRACE and p.type == syms.fstring_replacement_field:
+        return NO
+
     prev = leaf.prev_sibling
     if not prev:
         prevp = preceding_leaf(p)
@@ -272,6 +281,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str:  # no
     elif prev.type in OPENING_BRACKETS:
         return NO
 
+    elif prev.type == token.BANG:
+        return NO
+
     if p.type in {syms.parameters, syms.arglist}:
         # untyped function signatures or calls
         if not prev or prev.type != token.COMMA:
@@ -393,6 +405,7 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str:  # no
             elif prevp.type == token.EQUAL and prevp_parent.type == syms.argument:
                 return NO
 
+        # TODO: add fstring here?
         elif t in {token.NAME, token.NUMBER, token.STRING}:
             return NO
 
@@ -542,31 +555,32 @@ def is_arith_like(node: LN) -> bool:
     }
 
 
-def is_docstring(leaf: Leaf, mode: Mode) -> bool:
-    if leaf.type != token.STRING:
-        return False
+def is_docstring(node: NL, mode: Mode) -> bool:
+    if isinstance(node, Leaf):
+        if node.type != token.STRING:
+            return False
 
-    prefix = get_string_prefix(leaf.value)
-    if set(prefix).intersection("bBfF"):
-        return False
+        prefix = get_string_prefix(node.value)
+        if set(prefix).intersection("bBfF"):
+            return False
 
     if (
         Preview.unify_docstring_detection in mode
-        and leaf.parent
-        and leaf.parent.type == syms.simple_stmt
-        and not leaf.parent.prev_sibling
-        and leaf.parent.parent
-        and leaf.parent.parent.type == syms.file_input
+        and node.parent
+        and node.parent.type == syms.simple_stmt
+        and not node.parent.prev_sibling
+        and node.parent.parent
+        and node.parent.parent.type == syms.file_input
     ):
         return True
 
     if prev_siblings_are(
-        leaf.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt]
+        node.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt]
     ):
         return True
 
     # Multiline docstring on the same line as the `def`.
-    if prev_siblings_are(leaf.parent, [syms.parameters, token.COLON, syms.simple_stmt]):
+    if prev_siblings_are(node.parent, [syms.parameters, token.COLON, syms.simple_stmt]):
         # `syms.parameters` is only used in funcdefs and async_funcdefs in the Python
         # grammar. We're safe to return True without further checks.
         return True
@@ -954,10 +968,6 @@ def is_rpar_token(nl: NL) -> TypeGuard[Leaf]:
     return nl.type == token.RPAR
 
 
-def is_string_token(nl: NL) -> TypeGuard[Leaf]:
-    return nl.type == token.STRING
-
-
 def is_number_token(nl: NL) -> TypeGuard[Leaf]:
     return nl.type == token.NUMBER
 

diff --git a/src/black/strings.py b/src/black/strings.py
@@ -5,7 +5,7 @@
 import re
 import sys
 from functools import lru_cache
-from typing import Final, List, Match, Pattern
+from typing import Final, List, Match, Pattern, Tuple
 
 from black._width_table import WIDTH_TABLE
 from blib2to3.pytree import Leaf
@@ -169,8 +169,7 @@ def _cached_compile(pattern: str) -> Pattern[str]:
 def normalize_string_quotes(s: str) -> str:
     """Prefer double quotes but only if it doesn't cause more escaping.
 
-    Adds or removes backslashes as appropriate. Doesn't parse and fix
-    strings nested in f-strings.
+    Adds or removes backslashes as appropriate.
     """
     value = s.lstrip(STRING_PREFIX_CHARS)
     if value[:3] == '"""':
@@ -211,6 +210,7 @@ def normalize_string_quotes(s: str) -> str:
             s = f"{prefix}{orig_quote}{body}{orig_quote}"
         new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
         new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
+
     if "f" in prefix.casefold():
         matches = re.findall(
             r"""
@@ -240,6 +240,71 @@ def normalize_string_quotes(s: str) -> str:
     return f"{prefix}{new_quote}{new_body}{new_quote}"
 
 
+def normalize_fstring_quotes(
+    quote: str,
+    middles: List[Leaf],
+    is_raw_fstring: bool,
+) -> Tuple[List[Leaf], str]:
+    """Prefer double quotes but only if it doesn't cause more escaping.
+
+    Adds or removes backslashes as appropriate.
+
+    Args:
+        quote: The quote sequence currently delimiting the f-string
+            (`'`, `"`, `'''` or `\"\"\"`).
+        middles: Leaves whose `.value` holds the literal text segments of the
+            f-string (presumably its FSTRING_MIDDLE leaves; confirm against
+            the caller). Their values may be rewritten in place.
+        is_raw_fstring: True when backslashes must be neither added nor removed.
+
+    Returns:
+        A `(middles, quote)` pair: the same list that was passed in, and the
+        quote sequence the f-string should use afterwards. When the returned
+        quote equals the input `quote`, the delimiters are unchanged, although
+        unnecessary escapes of the other quote character may still have been
+        removed from the segments.
+    """
+    if quote == '"""':
+        return middles, quote
+
+    elif quote == "'''":
+        new_quote = '"""'
+    elif quote == '"':
+        new_quote = "'"
+    else:
+        new_quote = '"'
+
+    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
+    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
+    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
+    if is_raw_fstring:
+        if quote == '"':
+            # Already the preferred quote; nothing to convert.
+            return middles, quote
+
+        if any(unescaped_new_quote.search(middle.value) for middle in middles):
+            # There's at least one unescaped new_quote in this raw string
+            # so converting is impossible
+            return middles, quote
+
+        if new_quote == '"""' and middles and middles[-1].value.endswith('"'):
+            # A trailing `"` could run into the closing `"""`, and a raw string
+            # cannot escape it, so keep the original quotes.
+            return middles, quote
+
+        # Do not introduce or remove backslashes in raw strings, just switch
+        # to the double-quote form matching the original (`"` or `"""`).
+        return middles, new_quote
+
+    new_segments = []
+    for middle in middles:
+        segment = middle.value
+        # remove unnecessary escapes
+        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
+        if segment != new_segment:
+            # Consider the string without unnecessary escapes as the original
+            middle.value = new_segment
+
+        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
+        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
+        new_segments.append(new_segment)
+
+    if new_quote == '"""' and new_segments and new_segments[-1].endswith('"'):
+        # edge case: a final `"` directly before the closing `"""` would read
+        # as four quotes in a row, so it has to be escaped.
+        new_segments[-1] = new_segments[-1][:-1] + '\\"'
+
+    # Compare escaping over the whole f-string, not just its last segment.
+    # Summing also keeps both counts defined when there are no segments.
+    orig_escape_count = sum(middle.value.count("\\") for middle in middles)
+    new_escape_count = sum(segment.count("\\") for segment in new_segments)
+
+    if new_escape_count > orig_escape_count:
+        return middles, quote  # Do not introduce more escaping
+
+    if new_escape_count == orig_escape_count and quote == '"':
+        return middles, quote  # Prefer double quotes
+
+    for middle, new_segment in zip(middles, new_segments):
+        middle.value = new_segment
+
+    return middles, new_quote
+
+
 def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
     """Replace hex codes in Unicode escape sequences with lowercase representation."""
     text = leaf.value

diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt
@@ -163,7 +163,7 @@ atom: ('(' [yield_expr|testlist_gexp] ')' |
        '[' [listmaker] ']' |
        '{' [dictsetmaker] '}' |
        '`' testlist1 '`' |
-       NAME | NUMBER | STRING+ | '.' '.' '.')
+       NAME | NUMBER | (STRING | fstring)+ | '.' '.' '.')
 listmaker: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] )
 testlist_gexp: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] )
 lambdef: 'lambda' [varargslist] ':' test
@@ -254,3 +254,8 @@ case_block: "case" patterns [guard] ':' suite
 guard: 'if' namedexpr_test
 patterns: pattern (',' pattern)* [',']
 pattern: (expr|star_expr) ['as' expr]
+
+fstring: FSTRING_START fstring_middle* FSTRING_END
+fstring_middle: fstring_replacement_field | FSTRING_MIDDLE
+fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" NAME ] [ ':' fstring_format_spec* ] '}'
+fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field
diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py
@@ -167,7 +167,9 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
             if type in {token.INDENT, token.DEDENT}:
                 prefix = _prefix
             lineno, column = end
-            if value.endswith("\n"):
+            # FSTRING_MIDDLE is the only token that can end with a newline, and
+            # `end` will point to the next line. For that case, don't increment lineno.
+            if value.endswith("\n") and type != token.FSTRING_MIDDLE:
                 lineno += 1
                 column = 0
         else:

diff --git a/src/blib2to3/pgen2/grammar.py b/src/blib2to3/pgen2/grammar.py
@@ -218,6 +218,7 @@ def report(self) -> None:
 //= DOUBLESLASHEQUAL
 -> RARROW
 := COLONEQUAL
+! BANG
 """
 
 opmap = {}