From 48ad67cabb3426ec8ce8c2d28953a273c6f3bb8a Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 29 Jul 2023 23:45:56 +0530 Subject: [PATCH 01/77] Add PEP701 support --- src/blib2to3/pgen2/token.py | 5 +- src/blib2to3/pgen2/tokenize.py | 113 ++++++++++++++++++++++++++++++--- 2 files changed, 109 insertions(+), 9 deletions(-) diff --git a/src/blib2to3/pgen2/token.py b/src/blib2to3/pgen2/token.py index ed2fc4e85fc..761cc1c7e88 100644 --- a/src/blib2to3/pgen2/token.py +++ b/src/blib2to3/pgen2/token.py @@ -66,7 +66,10 @@ ASYNC: Final = 57 ERRORTOKEN: Final = 58 COLONEQUAL: Final = 59 -N_TOKENS: Final = 60 +FSTRING_START: Final = 60 +FSTRING_MIDDLE: Final = 61 +FSTRING_END: Final = 62 +N_TOKENS: Final = 63 NT_OFFSET: Final = 256 # --end constants-- diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index d0607f4b1e1..f02c76284c1 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,6 +27,7 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" +import io import sys from typing import ( Callable, @@ -57,6 +58,11 @@ NUMBER, OP, STRING, + LBRACE, + RBRACE, + FSTRING_START, + FSTRING_MIDDLE, + FSTRING_END, tok_name, ) @@ -66,7 +72,7 @@ import re from codecs import BOM_UTF8, lookup -from . import token +from blib2to3.pgen2 import token __all__ = [x for x in dir(token) if x[0] != "_"] + [ "tokenize", @@ -468,10 +474,12 @@ def generate_tokens( raise TokenError("EOF in multi-line string", strstart) endmatch = endprog.match(line) if endmatch: + endquote = endmatch.group(0) pos = end = endmatch.end(0) - yield ( - STRING, + yield from tokenize_string( contstr + line[:end], + startquote, + endquote, strstart, (lnum, end), contline + line, @@ -590,15 +598,19 @@ def generate_tokens( stashed = None yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: - endprog = endprogs[token] + startquote = token + endprog = endprogs[startquote] endmatch = endprog.match(line, pos) if endmatch: # all on one line + endquote = endmatch.group(0) pos = endmatch.end(0) token = line[start:pos] if stashed: yield stashed stashed = None - yield (STRING, token, spos, (lnum, pos), line) + yield from tokenize_string( + token, startquote, endquote, spos, (lnum, pos), line + ) else: strstart = (lnum, start) # multiple lines contstr = line[start:] @@ -627,7 +639,18 @@ def generate_tokens( if stashed: yield stashed stashed = None - yield (STRING, token, spos, epos, line) + + if initial in single_quoted: + startquote = initial + elif token[:2] in single_quoted: + startquote = token[:2] + else: + startquote = token[:3] + + endquote = token[-1] + yield from tokenize_string( + token, startquote, endquote, spos, epos, line + ) elif initial.isidentifier(): # ordinary name if token in ("async", "await"): if async_keywords or async_def: @@ -694,8 +717,82 @@ def generate_tokens( yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") +def tokenize_string( + string: str, + startquote: str, + endquote: str, + startpos: Coord, + endpos: Coord, + line: str, +) -> GoodTokenInfo: + if not string.startswith(("f", "F")): + # regular strings can still be returned as usual + yield (STRING, string, startpos, endpos, line) + return + + lnum = startpos[0] + yield (FSTRING_START, startquote, startpos, (lnum, len(startquote)), line) + pos = len(startquote) + max = len(string) - len(endquote) + while pos < max: + opening_bracket_index = string.find("{", pos) + if opening_bracket_index == -1: + string_part = string[pos:max] + yield 
(FSTRING_MIDDLE, string_part, (lnum, pos), (lnum, max), line) + pos = max + else: + string_part = string[pos:opening_bracket_index] + yield ( + FSTRING_MIDDLE, + string_part, + (lnum, pos), + (lnum, opening_bracket_index), + line, + ) + yield ( + LBRACE, + "{", + (lnum, opening_bracket_index), + (lnum, opening_bracket_index + 1), + line, + ) + pos = opening_bracket_index + 1 + + # TODO: skip over {{ + if pos < max: + inner_source = string[pos:max] + curly_brace_level = 1 + startpos = pos + for token in generate_tokens(io.StringIO(inner_source).readline): + pos = startpos + token[3][1] + + if token[0] == OP and token[1] == "{": + curly_brace_level += 1 + elif token[0] == OP and token[1] == "}": + curly_brace_level -= 1 + + if curly_brace_level == 0: + yield ( + RBRACE, + "}", + (lnum, pos), + (lnum, pos + 1), + line, + ) + break + + token_with_updated_pos = ( + token[0], + token[1], + (token[2][0], startpos + token[2][1]), + (token[3][0], startpos + token[3][1]), + token[4], + ) + yield token_with_updated_pos + + yield (FSTRING_END, endquote, (lnum, max), endpos, line) + + if __name__ == "__main__": # testing if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) - else: - tokenize(sys.stdin.readline) From 175942b906da22583821c86369de5d5fe9cf38e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 29 Jul 2023 18:18:29 +0000 Subject: [PATCH 02/77] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/blib2to3/pgen2/tokenize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index f02c76284c1..0e8815e9da5 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -51,18 +51,18 @@ DEDENT, ENDMARKER, ERRORTOKEN, + FSTRING_END, + FSTRING_MIDDLE, + FSTRING_START, INDENT, + LBRACE, NAME, NEWLINE, NL, NUMBER, OP, - STRING, - LBRACE, RBRACE, - FSTRING_START, - FSTRING_MIDDLE, - FSTRING_END, + STRING, tok_name, ) From 9e344f43283024d911a5bab1d0e8cb93c35aae49 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 14 Aug 2023 00:59:11 +0530 Subject: [PATCH 03/77] Add FSTRING_START and FSTRING_MIDDLE tokenizing --- src/blib2to3/pgen2/tokenize.py | 178 ++++++++++++--------------------- 1 file changed, 63 insertions(+), 115 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 0e8815e9da5..ea90cccd8c6 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,7 +27,6 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" -import io import sys from typing import ( Callable, @@ -61,7 +60,6 @@ NL, NUMBER, OP, - RBRACE, STRING, tok_name, ) @@ -72,7 +70,7 @@ import re from codecs import BOM_UTF8, lookup -from blib2to3.pgen2 import token +from . import token __all__ = [x for x in dir(token) if x[0] != "_"] + [ "tokenize", @@ -127,13 +125,12 @@ def _combinations(*l: str) -> Set[str]: Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" # Tail end of """ string. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?" +_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?" +_fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)" Triple = group(_litprefix + "'''", _litprefix + '"""') -# Single-line ' or " string. 
-String = group( - _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"', -) + +SingleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" +DoubleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get @@ -155,41 +152,57 @@ def _combinations(*l: str) -> Set[str]: Funny = group(Operator, Bracket, Special) # First (or only) line of ' or " string. +# TODO: handle escaping `{{` ContStr = group( _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"), _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"), + rf"({_fstringlitprefix}')[^\n'\\{{]*(?:\\.[^\n'\\{{]*)*" + + group("'", "{", r"\\\r?\n"), + rf'({_fstringlitprefix}")[^\n"\\{{]*(?:\\.[^\n"\\{{]*)*' + + group('"', "{", r"\\\r?\n"), ) PseudoExtras = group(r"\\\r?\n", Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) pseudoprog: Final = re.compile(PseudoToken, re.UNICODE) + +singleprog = re.compile(Single) +singleprog_plus_lbrace = re.compile(group(SingleLbrace, Single)) +doubleprog = re.compile(Double) +doubleprog_plus_lbrace = re.compile(group(DoubleLbrace, Double)) + single3prog = re.compile(Single3) +single3prog_plus_lbrace = re.compile(group(SingleLbrace, Single3)) double3prog = re.compile(Double3) +double3prog_plus_lbrace = re.compile(group(DoubleLbrace, Double3)) -_strprefixes = ( - _combinations("r", "R", "f", "F") - | _combinations("r", "R", "b", "B") - | {"u", "U", "ur", "uR", "Ur", "UR"} -) +_strprefixes = _combinations("r", "R", "b", "B") | {"u", "U", "ur", "uR", "Ur", "UR"} +_fstring_prefixes = _combinations("r", "R", "f", "F") - {"r", "R"} endprogs: Final = { - "'": re.compile(Single), - '"': re.compile(Double), + "'": singleprog, + '"': doubleprog, "'''": single3prog, '"""': double3prog, + **{f"{prefix}'": singleprog for prefix in _strprefixes}, + **{f'{prefix}"': doubleprog for prefix in _strprefixes}, + **{f"{prefix}'": singleprog_plus_lbrace for prefix in _fstring_prefixes}, + **{f'{prefix}"': doubleprog_plus_lbrace for prefix in _fstring_prefixes}, **{f"{prefix}'''": single3prog for prefix in _strprefixes}, **{f'{prefix}"""': double3prog for prefix in _strprefixes}, + **{f"{prefix}'''": single3prog_plus_lbrace for prefix in _fstring_prefixes}, + **{f'{prefix}"""': double3prog_plus_lbrace for prefix in _fstring_prefixes}, } triple_quoted: Final = ( {"'''", '"""'} - | {f"{prefix}'''" for prefix in _strprefixes} - | {f'{prefix}"""' for prefix in _strprefixes} + | {f"{prefix}'''" for prefix in _strprefixes | _fstring_prefixes} + | {f'{prefix}"""' for prefix in _strprefixes | _fstring_prefixes} ) single_quoted: Final = ( {"'", '"'} - | {f"{prefix}'" for prefix in _strprefixes} - | {f'{prefix}"' for prefix in _strprefixes} + | {f"{prefix}'" for prefix in _strprefixes | _fstring_prefixes} + | {f'{prefix}"' for prefix in _strprefixes | _fstring_prefixes} ) tabsize = 8 @@ -474,12 +487,10 @@ def generate_tokens( raise TokenError("EOF in multi-line string", strstart) endmatch = endprog.match(line) if endmatch: - endquote = endmatch.group(0) pos = end = endmatch.end(0) - yield from tokenize_string( + yield ( + STRING, contstr + line[:end], - startquote, - endquote, strstart, (lnum, end), contline + line, @@ -598,19 +609,15 @@ def generate_tokens( stashed = None yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: - startquote = token - endprog = endprogs[startquote] + endprog = endprogs[token] endmatch = 
endprog.match(line, pos) if endmatch: # all on one line - endquote = endmatch.group(0) pos = endmatch.end(0) token = line[start:pos] if stashed: yield stashed stashed = None - yield from tokenize_string( - token, startquote, endquote, spos, (lnum, pos), line - ) + yield (STRING, token, spos, (lnum, pos), line) else: strstart = (lnum, start) # multiple lines contstr = line[start:] @@ -640,17 +647,32 @@ def generate_tokens( yield stashed stashed = None - if initial in single_quoted: - startquote = initial - elif token[:2] in single_quoted: - startquote = token[:2] + # TODO: move this logic to a function + if not token.endswith("{"): + yield (STRING, token, spos, epos, line) else: - startquote = token[:3] + if pseudomatch[20] is not None: + fstring_start = pseudomatch[20] + offset = pseudomatch.end(20) - pseudomatch.start() + start_epos = (lnum, start + offset) + else: + fstring_start = pseudomatch[22] + offset = pseudomatch.end(22) - pseudomatch.start() + start_epos = (lnum, start + offset - 1) + yield (FSTRING_START, fstring_start, spos, start_epos, line) + end_offset = pseudomatch.end() - 1 + fstring_middle = line[start + offset - 1 : end_offset] + middle_spos = (lnum, start + offset) + middle_epos = (lnum, end_offset + 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + middle_spos, + middle_epos, + line, + ) + yield (LBRACE, "{", (lnum, end_offset + 1), epos, line) - endquote = token[-1] - yield from tokenize_string( - token, startquote, endquote, spos, epos, line - ) elif initial.isidentifier(): # ordinary name if token in ("async", "await"): if async_keywords or async_def: @@ -717,82 +739,8 @@ def generate_tokens( yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") -def tokenize_string( - string: str, - startquote: str, - endquote: str, - startpos: Coord, - endpos: Coord, - line: str, -) -> GoodTokenInfo: - if not string.startswith(("f", "F")): - # regular strings can still be returned as usual - yield (STRING, string, startpos, endpos, line) - return - - lnum = startpos[0] - yield (FSTRING_START, startquote, startpos, (lnum, len(startquote)), line) - pos = len(startquote) - max = len(string) - len(endquote) - while pos < max: - opening_bracket_index = string.find("{", pos) - if opening_bracket_index == -1: - string_part = string[pos:max] - yield (FSTRING_MIDDLE, string_part, (lnum, pos), (lnum, max), line) - pos = max - else: - string_part = string[pos:opening_bracket_index] - yield ( - FSTRING_MIDDLE, - string_part, - (lnum, pos), - (lnum, opening_bracket_index), - line, - ) - yield ( - LBRACE, - "{", - (lnum, opening_bracket_index), - (lnum, opening_bracket_index + 1), - line, - ) - pos = opening_bracket_index + 1 - - # TODO: skip over {{ - if pos < max: - inner_source = string[pos:max] - curly_brace_level = 1 - startpos = pos - for token in generate_tokens(io.StringIO(inner_source).readline): - pos = startpos + token[3][1] - - if token[0] == OP and token[1] == "{": - curly_brace_level += 1 - elif token[0] == OP and token[1] == "}": - curly_brace_level -= 1 - - if curly_brace_level == 0: - yield ( - RBRACE, - "}", - (lnum, pos), - (lnum, pos + 1), - line, - ) - break - - token_with_updated_pos = ( - token[0], - token[1], - (token[2][0], startpos + token[2][1]), - (token[3][0], startpos + token[3][1]), - token[4], - ) - yield token_with_updated_pos - - yield (FSTRING_END, endquote, (lnum, max), endpos, line) - - if __name__ == "__main__": # testing if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) + else: + tokenize(sys.stdin.readline) From 
dbdb02c5c1b8d8bcd89b3f5d6ca8ff657d48fea0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 16 Aug 2023 00:46:13 +0530 Subject: [PATCH 04/77] Support escaping of `{{` --- src/blib2to3/pgen2/tokenize.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index ea90cccd8c6..1158bfc1d29 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -152,14 +152,11 @@ def _combinations(*l: str) -> Set[str]: Funny = group(Operator, Bracket, Special) # First (or only) line of ' or " string. -# TODO: handle escaping `{{` ContStr = group( _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"), _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"), - rf"({_fstringlitprefix}')[^\n'\\{{]*(?:\\.[^\n'\\{{]*)*" - + group("'", "{", r"\\\r?\n"), - rf'({_fstringlitprefix}")[^\n"\\{{]*(?:\\.[^\n"\\{{]*)*' - + group('"', "{", r"\\\r?\n"), + group(_fstringlitprefix + "'") + r"[^\n'\\]*(?:\\.[^\n'\\]*)*({{)(? Date: Wed, 16 Aug 2023 00:50:56 +0530 Subject: [PATCH 05/77] typo --- src/blib2to3/pgen2/tokenize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 1158bfc1d29..19dc35c2b52 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -155,8 +155,8 @@ def _combinations(*l: str) -> Set[str]: ContStr = group( _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"), _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"), - group(_fstringlitprefix + "'") + r"[^\n'\\]*(?:\\.[^\n'\\]*)*({{)(? Date: Sun, 27 Aug 2023 19:08:58 +0530 Subject: [PATCH 06/77] fix some problems with triple quoted strings --- src/blib2to3/pgen2/tokenize.py | 45 ++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 19dc35c2b52..6ce1a579bba 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -127,11 +127,20 @@ def _combinations(*l: str) -> Set[str]: Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' _litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?" _fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)" -Triple = group(_litprefix + "'''", _litprefix + '"""') +Triple = group( + _litprefix + "'''", + _litprefix + '"""', + _fstringlitprefix + '"""', + _fstringlitprefix + "'''", +) +# TODO: these two are the same. remove one SingleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" DoubleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" +Single3Lbrace = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*{" +Double3Lbrace = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*{' + # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get # recognized as two instances of =). @@ -151,12 +160,16 @@ def _combinations(*l: str) -> Set[str]: Special = group(r"\r?\n", r"[:;.,`@]") Funny = group(Operator, Bracket, Special) +# FSTRING_MIDDLE and LBRACE, inside a single quoted fstring +_fstring_middle_single = r"[^\n'\\]*(?:\\.[^\n'\\]*)*({)(? 
Set[str]: doubleprog_plus_lbrace = re.compile(group(DoubleLbrace, Double)) single3prog = re.compile(Single3) -single3prog_plus_lbrace = re.compile(group(SingleLbrace, Single3)) +single3prog_plus_lbrace = re.compile(group(Single3Lbrace, Single3)) double3prog = re.compile(Double3) -double3prog_plus_lbrace = re.compile(group(DoubleLbrace, Double3)) +double3prog_plus_lbrace = re.compile(group(Double3Lbrace, Double3)) _strprefixes = _combinations("r", "R", "b", "B") | {"u", "U", "ur", "uR", "Ur", "UR"} _fstring_prefixes = _combinations("r", "R", "f", "F") - {"r", "R"} @@ -609,12 +622,28 @@ def generate_tokens( endprog = endprogs[token] endmatch = endprog.match(line, pos) if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] if stashed: yield stashed stashed = None - yield (STRING, token, spos, (lnum, pos), line) + # TODO: move this logic to a function + # TODO: not how you should identify FSTRING_START + if not token.startswith("f"): + pos = endmatch.end(0) + token = line[start:pos] + yield (STRING, token, spos, epos, line) + else: + # TODO: most of this is wrong + yield (FSTRING_START, token, spos, epos, line) + pos = endmatch.end(0) + token = line[start:pos] + yield ( + FSTRING_MIDDLE, + token, + spos, + epos, + line, + ) + yield (LBRACE, "{", epos, epos, line) else: strstart = (lnum, start) # multiple lines contstr = line[start:] From ee30cde47372e2fe8404ea3fffdd9becc75d1a92 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 27 Aug 2023 20:48:05 +0530 Subject: [PATCH 07/77] Add support for FSTRING_MIDDLE and FSTRING_END --- src/blib2to3/pgen2/tokenize.py | 63 +++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 6ce1a579bba..1a963639ef0 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -60,6 +60,7 @@ NL, NUMBER, OP, + RBRACE, STRING, tok_name, ) @@ -465,7 +466,8 @@ def generate_tokens( and the line on which the token was found. The line passed is the logical line; continuation lines are included. 
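    F-strings additionally produce FSTRING_START, FSTRING_MIDDLE and
    FSTRING_END tokens, with LBRACE and RBRACE tokens around each
    replacement field.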
""" - lnum = parenlev = continued = 0 + lnum = parenlev = fstring_level = continued = 0 + inside_fstring_braces = False numchars: Final[str] = "0123456789" contstr, needcont = "", 0 contline: Optional[str] = None @@ -491,7 +493,7 @@ def generate_tokens( lnum += 1 pos, max = 0, len(line) - if contstr: # continued string + if contstr and not inside_fstring_braces: # continued string assert contline is not None if not line: raise TokenError("EOF in multi-line string", strstart) @@ -523,7 +525,8 @@ def generate_tokens( contline = contline + line continue - elif parenlev == 0 and not continued: # new statement + # new statement + elif parenlev == 0 and not continued and not inside_fstring_braces: if not line: break column = 0 @@ -591,6 +594,32 @@ def generate_tokens( continued = 0 while pos < max: + if fstring_level > 0 and not inside_fstring_braces: + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + start, end = endmatch.span(0) + token = line[start:end] + pos = end + # TODO: unsure if this can be safely removed + if stashed: + yield stashed + stashed = None + if not token.endswith("{"): + # TODO: locations + yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) + yield (FSTRING_END, token, (lnum, 0), (lnum, 0), line) + fstring_level -= 1 + else: + # TODO: most of the positions are wrong + yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) + yield (LBRACE, "{", (lnum, 0), (lnum, 0), line) + inside_fstring_braces = True + else: # multiple lines + breakpoint() # TODO: see if the code below is correct + contstr += line + contline += line + break + pseudomatch = pseudoprog.match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) @@ -632,8 +661,9 @@ def generate_tokens( token = line[start:pos] yield (STRING, token, spos, epos, line) else: - # TODO: most of this is wrong + # TODO: most of the positions are wrong yield (FSTRING_START, token, spos, epos, line) + fstring_level += 1 pos = endmatch.end(0) token = line[start:pos] yield ( @@ -644,6 +674,7 @@ def generate_tokens( line, ) yield (LBRACE, "{", epos, epos, line) + inside_fstring_braces = True else: strstart = (lnum, start) # multiple lines contstr = line[start:] @@ -654,17 +685,17 @@ def generate_tokens( or token[:2] in single_quoted or token[:3] in single_quoted ): + maybe_endprog = ( + endprogs.get(initial) + or endprogs.get(token[1]) + or endprogs.get(token[2]) + ) + assert ( + maybe_endprog is not None + ), f"endprog not found for {token}" + endprog = maybe_endprog if token[-1] == "\n": # continued string strstart = (lnum, start) - maybe_endprog = ( - endprogs.get(initial) - or endprogs.get(token[1]) - or endprogs.get(token[2]) - ) - assert ( - maybe_endprog is not None - ), f"endprog not found for {token}" - endprog = maybe_endprog contstr, needcont = line[start:], 1 contline = line break @@ -686,6 +717,8 @@ def generate_tokens( offset = pseudomatch.end(22) - pseudomatch.start() start_epos = (lnum, start + offset - 1) yield (FSTRING_START, fstring_start, spos, start_epos, line) + fstring_level += 1 + end_offset = pseudomatch.end() - 1 fstring_middle = line[start + offset - 1 : end_offset] middle_spos = (lnum, start + offset) @@ -698,6 +731,7 @@ def generate_tokens( line, ) yield (LBRACE, "{", (lnum, end_offset + 1), epos, line) + inside_fstring_braces = True elif initial.isidentifier(): # ordinary name if token in ("async", "await"): @@ -743,6 +777,9 @@ def generate_tokens( stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 + elif initial == '}' and parenlev 
== 0 and inside_fstring_braces: + inside_fstring_braces = False + yield (RBRACE, token, spos, epos, line) else: if initial in "([{": parenlev += 1 From e7b58500b91d00b55f4339d20d4bc72862d2d0f2 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 29 Aug 2023 18:35:51 +0530 Subject: [PATCH 08/77] bugfix and simplify the regexes --- src/blib2to3/pgen2/tokenize.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 1a963639ef0..4ae4324074f 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -135,12 +135,11 @@ def _combinations(*l: str) -> Set[str]: _fstringlitprefix + "'''", ) -# TODO: these two are the same. remove one -SingleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" -DoubleLbrace = r"[^{\\]*(?:\\.[^{\\]*)*{" +SingleLbrace = r"[^'\\{]*(?:(?:\\.|{{)[^'\\{]*)*{" +DoubleLbrace = r'[^"\\{]*(?:(?:\\.|{{)[^"\\{]*)*{' -Single3Lbrace = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*{" -Double3Lbrace = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*{' +Single3Lbrace = r"[^'\\{]*(?:(?:\\.|{{|'(?!''))[^'\\{]*)*{" +Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{' # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get @@ -162,8 +161,8 @@ def _combinations(*l: str) -> Set[str]: Funny = group(Operator, Bracket, Special) # FSTRING_MIDDLE and LBRACE, inside a single quoted fstring -_fstring_middle_single = r"[^\n'\\]*(?:\\.[^\n'\\]*)*({)(? Date: Wed, 6 Sep 2023 15:51:30 +0530 Subject: [PATCH 09/77] Fix small regex problems --- src/blib2to3/pgen2/tokenize.py | 44 +++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 4ae4324074f..fdb5f4e74cf 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -135,11 +135,11 @@ def _combinations(*l: str) -> Set[str]: _fstringlitprefix + "'''", ) -SingleLbrace = r"[^'\\{]*(?:(?:\\.|{{)[^'\\{]*)*{" -DoubleLbrace = r'[^"\\{]*(?:(?:\\.|{{)[^"\\{]*)*{' +SingleLbrace = r"[^'\\{]*(?:(?:\\.|{{)[^'\\{]*)*{(?!{)" +DoubleLbrace = r'[^"\\{]*(?:(?:\\.|{{)[^"\\{]*)*{(?!{)' -Single3Lbrace = r"[^'\\{]*(?:(?:\\.|{{|'(?!''))[^'\\{]*)*{" -Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{' +Single3Lbrace = r"[^'\\{]*(?:(?:\\.|{{|'(?!''))[^'\\{]*)*{(?!{)" +Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{(?!{)' # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get @@ -499,13 +499,24 @@ def generate_tokens( endmatch = endprog.match(line) if endmatch: pos = end = endmatch.end(0) - yield ( - STRING, - contstr + line[:end], - strstart, - (lnum, end), - contline + line, - ) + token = contstr + line[:end] + spos = strstart + epos = (lnum, end) + tokenline = contline + line + # TODO: better way to detect fstring + if fstring_level == 0: + yield (STRING, token, spos, epos, tokenline) + else: + # TODO: positions are all wrong + yield (FSTRING_MIDDLE, token, spos, epos, tokenline) + if token.endswith("{"): + yield (LBRACE, "{", spos, epos, tokenline) + inside_fstring_braces = True + else: + yield (FSTRING_END, token, spos, epos, tokenline) + fstring_level -= 1 + # TODO: contstr reliance doesn't work now because we can be inside + # an fstring and still empty contstr right here. 
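+            # i.e. a multiline f-string can still be open when contstr is
+            # cleared below, so f-string state cannot live in contstr alone.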
contstr, needcont = "", 0 contline = None elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n": @@ -614,9 +625,8 @@ def generate_tokens( yield (LBRACE, "{", (lnum, 0), (lnum, 0), line) inside_fstring_braces = True else: # multiple lines - breakpoint() # TODO: see if the code below is correct contstr += line - contline += line + contline = line break pseudomatch = pseudoprog.match(line, pos) @@ -647,6 +657,10 @@ def generate_tokens( stashed = None yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: + if token.startswith("f"): + yield (FSTRING_START, token, spos, epos, line) + fstring_level += 1 + endprog = endprogs[token] endmatch = endprog.match(line, pos) if endmatch: # all on one line @@ -661,8 +675,6 @@ def generate_tokens( yield (STRING, token, spos, epos, line) else: # TODO: most of the positions are wrong - yield (FSTRING_START, token, spos, epos, line) - fstring_level += 1 pos = endmatch.end(0) token = line[start:pos] yield ( @@ -775,8 +787,8 @@ def generate_tokens( yield (NL, token, spos, (lnum, pos), line) continued = 1 elif initial == "}" and parenlev == 0 and inside_fstring_braces: - inside_fstring_braces = False yield (RBRACE, token, spos, epos, line) + inside_fstring_braces = False else: if initial in "([{": parenlev += 1 From c1ecc146f84682276f2490838f5c1b39aeb88197 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 7 Sep 2023 00:28:15 +0530 Subject: [PATCH 10/77] fix newline type --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index fdb5f4e74cf..95dd6aa63b6 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -641,7 +641,7 @@ def generate_tokens( yield (NUMBER, token, spos, epos, line) elif initial in "\r\n": newline = NEWLINE - if parenlev > 0: + if parenlev > 0 or inside_fstring_braces: newline = NL elif async_def: async_def_nl = True From 644c5cc05906effef9c77699541776cc71906612 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 10 Sep 2023 15:12:39 +0530 Subject: [PATCH 11/77] turn endprog into endprog_stack --- src/blib2to3/pgen2/tokenize.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 95dd6aa63b6..396b21a5c33 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -482,7 +482,7 @@ def generate_tokens( async_def_nl = False strstart: Tuple[int, int] - endprog: Pattern[str] + endprog_stack: list[Pattern[str]] = [] while 1: # loop over lines in stream try: @@ -496,6 +496,7 @@ def generate_tokens( assert contline is not None if not line: raise TokenError("EOF in multi-line string", strstart) + endprog = endprog_stack[-1] endmatch = endprog.match(line) if endmatch: pos = end = endmatch.end(0) @@ -515,6 +516,7 @@ def generate_tokens( else: yield (FSTRING_END, token, spos, epos, tokenline) fstring_level -= 1 + endprog_stack.pop() # TODO: contstr reliance doesn't work now because we can be inside # an fstring and still empty contstr right here. 
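            # endprog_stack keeps one compiled end pattern per still-open
            # (f-)string, e.g. f"{f'{x}'}" stacks the double-quote pattern
            # and then the single-quote pattern.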
contstr, needcont = "", 0 @@ -605,6 +607,7 @@ def generate_tokens( while pos < max: if fstring_level > 0 and not inside_fstring_braces: + endprog = endprog_stack[-1] endmatch = endprog.match(line, pos) if endmatch: # all on one line start, end = endmatch.span(0) @@ -619,6 +622,7 @@ def generate_tokens( yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) yield (FSTRING_END, token, (lnum, 0), (lnum, 0), line) fstring_level -= 1 + endprog_stack.pop() else: # TODO: most of the positions are wrong yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) @@ -657,11 +661,12 @@ def generate_tokens( stashed = None yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: + endprog = endprogs[token] if token.startswith("f"): yield (FSTRING_START, token, spos, epos, line) fstring_level += 1 + endprog_stack.append(endprog) - endprog = endprogs[token] endmatch = endprog.match(line, pos) if endmatch: # all on one line if stashed: @@ -704,6 +709,7 @@ def generate_tokens( assert maybe_endprog is not None, f"endprog not found for {token}" endprog = maybe_endprog if token[-1] == "\n": # continued string + endprog_stack.append(endprog) strstart = (lnum, start) contstr, needcont = line[start:], 1 contline = line @@ -727,6 +733,8 @@ def generate_tokens( start_epos = (lnum, start + offset - 1) yield (FSTRING_START, fstring_start, spos, start_epos, line) fstring_level += 1 + endprog = endprogs[fstring_start] + endprog_stack.append(endprog) end_offset = pseudomatch.end() - 1 fstring_middle = line[start + offset - 1 : end_offset] From b23cdfd07242173fbdcc3c267b9a7ae573f11fb6 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 12 Sep 2023 00:11:57 +0530 Subject: [PATCH 12/77] Support fstrings with no braces --- src/blib2to3/pgen2/tokenize.py | 56 ++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 396b21a5c33..95eba97a53e 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -160,16 +160,21 @@ def _combinations(*l: str) -> Set[str]: Special = group(r"\r?\n", r"[:;.,`@]") Funny = group(Operator, Bracket, Special) +_string_middle_single = r"[^\n'\\]*(?:\\.[^\n'\\]*)*" +_string_middle_double = r'[^\n"\\]*(?:\\.[^\n"\\]*)*' + # FSTRING_MIDDLE and LBRACE, inside a single quoted fstring -_fstring_middle_single = r"[^\n'\\{]*(?:(?:\\.|{{)[^\n'\\{]*)*({)" -_fstring_middle_double = r'[^\n"\\{]*(?:(?:\\.|{{)[^\n"\\{]*)*({)' +_fstring_middle_single = r"[^\n'\\{]*(?:(?:\\.|{{)[^\n'\\{]*)*({)(?!{)" +_fstring_middle_double = r'[^\n"\\{]*(?:(?:\\.|{{)[^\n"\\{]*)*({)(?!{)' # First (or only) line of ' or " string. ContStr = group( - _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"), - _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"), + _litprefix + "'" + _string_middle_single + group("'", r"\\\r?\n"), + _litprefix + '"' + _string_middle_double + group('"', r"\\\r?\n"), group(_fstringlitprefix + "'") + _fstring_middle_single, group(_fstringlitprefix + '"') + _fstring_middle_double, + group(_fstringlitprefix + "'") + _string_middle_single + group("'", r"\\\r?\n"), + group(_fstringlitprefix + '"') + _string_middle_double + group('"', r"\\\r?\n"), ) PseudoExtras = group(r"\\\r?\n", Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) @@ -492,6 +497,13 @@ def generate_tokens( lnum += 1 pos, max = 0, len(line) + + # TODO: probably inside_fstring_braces is not the best boolean. 
+ # what about a case of a string inside a multiline fstring inside a + # multiline fstring?? + # for eg. this doesn't work right now: f"{f'{2+2}'}" + # because inside_fstring_braces gets set to false after the first `}` + # print(f'{parenlev = } {continued = } {inside_fstring_braces = }') if contstr and not inside_fstring_braces: # continued string assert contline is not None if not line: @@ -514,7 +526,7 @@ def generate_tokens( yield (LBRACE, "{", spos, epos, tokenline) inside_fstring_braces = True else: - yield (FSTRING_END, token, spos, epos, tokenline) + yield (FSTRING_END, token[-1], spos, epos, tokenline) fstring_level -= 1 endprog_stack.pop() # TODO: contstr reliance doesn't work now because we can be inside @@ -620,7 +632,7 @@ def generate_tokens( if not token.endswith("{"): # TODO: locations yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) - yield (FSTRING_END, token, (lnum, 0), (lnum, 0), line) + yield (FSTRING_END, token[-1], (lnum, 0), (lnum, 0), line) fstring_level -= 1 endprog_stack.pop() else: @@ -689,8 +701,14 @@ def generate_tokens( epos, line, ) - yield (LBRACE, "{", epos, epos, line) - inside_fstring_braces = True + if not token.endswith("{"): + yield (FSTRING_END, token[-1], epos, epos, line) + fstring_level -= 1 + endprog_stack.pop() + else: + # TODO: most of the positions are wrong + yield (LBRACE, "{", epos, epos, line) + inside_fstring_braces = True else: strstart = (lnum, start) # multiple lines contstr = line[start:] @@ -720,17 +738,25 @@ def generate_tokens( stashed = None # TODO: move this logic to a function - if not token.endswith("{"): + if not token.startswith("f"): yield (STRING, token, spos, epos, line) else: if pseudomatch[20] is not None: fstring_start = pseudomatch[20] offset = pseudomatch.end(20) - pseudomatch.start() start_epos = (lnum, start + offset) - else: + elif pseudomatch[22] is not None: fstring_start = pseudomatch[22] offset = pseudomatch.end(22) - pseudomatch.start() start_epos = (lnum, start + offset - 1) + elif pseudomatch[24] is not None: + fstring_start = pseudomatch[24] + offset = pseudomatch.end(24) - pseudomatch.start() + start_epos = (lnum, start + offset - 1) + else: + fstring_start = pseudomatch[26] + offset = pseudomatch.end(26) - pseudomatch.start() + start_epos = (lnum, start + offset - 1) yield (FSTRING_START, fstring_start, spos, start_epos, line) fstring_level += 1 endprog = endprogs[fstring_start] @@ -747,8 +773,14 @@ def generate_tokens( middle_epos, line, ) - yield (LBRACE, "{", (lnum, end_offset + 1), epos, line) - inside_fstring_braces = True + if not token.endswith("{"): + yield (FSTRING_END, token[-1], epos, epos, line) + fstring_level -= 1 + endprog_stack.pop() + else: + # TODO: most of the positions are wrong + yield (LBRACE, "{", epos, epos, line) + inside_fstring_braces = True elif initial.isidentifier(): # ordinary name if token in ("async", "await"): From bbbac0abf529ed5854f777f060bbbee4bdbfa3e4 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 18 Sep 2023 02:52:48 +0530 Subject: [PATCH 13/77] Add grammar changes --- src/blib2to3/Grammar.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index e48e66363fb..0b369ddd9ff 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -166,7 +166,7 @@ atom: ('(' [yield_expr|testlist_gexp] ')' | '[' [listmaker] ']' | '{' [dictsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+ | '.' '.' '.') + NAME | NUMBER | STRING+ | fstring+ | '.' '.' 
'.') listmaker: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) testlist_gexp: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) lambdef: 'lambda' [varargslist] ':' test @@ -257,3 +257,7 @@ case_block: "case" patterns [guard] ':' suite guard: 'if' namedexpr_test patterns: pattern (',' pattern)* [','] pattern: (expr|star_expr) ['as' expr] + +fstring: FSTRING_START fstring_middle* FSTRING_END +fstring_middle: fstring_replacement_field | FSTRING_MIDDLE +fstring_replacement_field: '{' (yield_expr | testlist_star_expr) '}' From dadaa64d4efbb9c246e96a6cc6f3ae9c40910006 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 18 Sep 2023 19:55:32 +0530 Subject: [PATCH 14/77] fix some locations --- src/blib2to3/pgen2/tokenize.py | 43 +++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 95eba97a53e..276ccab531f 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -497,7 +497,6 @@ def generate_tokens( lnum += 1 pos, max = 0, len(line) - # TODO: probably inside_fstring_braces is not the best boolean. # what about a case of a string inside a multiline fstring inside a # multiline fstring?? @@ -624,22 +623,35 @@ def generate_tokens( if endmatch: # all on one line start, end = endmatch.span(0) token = line[start:end] - pos = end + # TODO: triple quotes + middle_token, end_token = token[:-1], token[-1] # TODO: unsure if this can be safely removed if stashed: yield stashed stashed = None + yield ( + FSTRING_MIDDLE, + middle_token, + (lnum, pos), + (lnum, end - 1), + line, + ) if not token.endswith("{"): - # TODO: locations - yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) - yield (FSTRING_END, token[-1], (lnum, 0), (lnum, 0), line) + # TODO: end-1 is probably wrong + yield ( + FSTRING_END, + end_token, + (lnum, end - 1), + (lnum, end), + line, + ) fstring_level -= 1 endprog_stack.pop() else: # TODO: most of the positions are wrong - yield (FSTRING_MIDDLE, token, (lnum, 0), (lnum, 0), line) yield (LBRACE, "{", (lnum, 0), (lnum, 0), line) inside_fstring_braces = True + pos = end else: # multiple lines contstr += line contline = line @@ -748,24 +760,24 @@ def generate_tokens( elif pseudomatch[22] is not None: fstring_start = pseudomatch[22] offset = pseudomatch.end(22) - pseudomatch.start() - start_epos = (lnum, start + offset - 1) + start_epos = (lnum, start + offset) elif pseudomatch[24] is not None: fstring_start = pseudomatch[24] offset = pseudomatch.end(24) - pseudomatch.start() - start_epos = (lnum, start + offset - 1) + start_epos = (lnum, start + offset) else: fstring_start = pseudomatch[26] offset = pseudomatch.end(26) - pseudomatch.start() - start_epos = (lnum, start + offset - 1) + start_epos = (lnum, start + offset) yield (FSTRING_START, fstring_start, spos, start_epos, line) fstring_level += 1 endprog = endprogs[fstring_start] endprog_stack.append(endprog) end_offset = pseudomatch.end() - 1 - fstring_middle = line[start + offset - 1 : end_offset] + fstring_middle = line[start + offset : end_offset] middle_spos = (lnum, start + offset) - middle_epos = (lnum, end_offset + 1) + middle_epos = (lnum, end_offset) yield ( FSTRING_MIDDLE, fstring_middle, @@ -774,12 +786,15 @@ def generate_tokens( line, ) if not token.endswith("{"): - yield (FSTRING_END, token[-1], epos, epos, line) + end_spos = (lnum, end_offset) + end_epos = (lnum, end_offset + 1) + yield (FSTRING_END, 
token[-1], end_spos, end_epos, line) fstring_level -= 1 endprog_stack.pop() else: - # TODO: most of the positions are wrong - yield (LBRACE, "{", epos, epos, line) + end_spos = (lnum, end_offset) + end_epos = (lnum, end_offset + 1) + yield (LBRACE, "{", end_spos, end_epos, line) inside_fstring_braces = True elif initial.isidentifier(): # ordinary name From a57e404c84b9b45158974e5774a0ed8ceb0894ce Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 18 Sep 2023 23:43:40 +0530 Subject: [PATCH 15/77] remove padding from fstring_middle and fstring_end --- src/black/nodes.py | 10 +++++++++- src/blib2to3/pygram.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/black/nodes.py b/src/black/nodes.py index 45423b2596b..3f9f0a9be2e 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -131,7 +131,12 @@ OPENING_BRACKETS: Final = set(BRACKET.keys()) CLOSING_BRACKETS: Final = set(BRACKET.values()) BRACKETS: Final = OPENING_BRACKETS | CLOSING_BRACKETS -ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | {token.COMMA, STANDALONE_COMMENT} +ALWAYS_NO_SPACE: Final = CLOSING_BRACKETS | { + token.COMMA, + STANDALONE_COMMENT, + token.FSTRING_MIDDLE, + token.FSTRING_END, +} RARROW = 55 @@ -197,6 +202,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool) -> str: # noqa: C901 }: return NO + if t == token.LBRACE and p.type == syms.fstring_replacement_field: + return NO + prev = leaf.prev_sibling if not prev: prevp = preceding_leaf(p) diff --git a/src/blib2to3/pygram.py b/src/blib2to3/pygram.py index c30c630e816..8c93e4ddb13 100644 --- a/src/blib2to3/pygram.py +++ b/src/blib2to3/pygram.py @@ -71,6 +71,9 @@ class _python_symbols(Symbols): file_input: int flow_stmt: int for_stmt: int + fstring: int + fstring_middle: int + fstring_replacement_field: int funcdef: int global_stmt: int guard: int From fff25fbe9b7db2e8a383f69ab2a76d80bb8d4429 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 19 Sep 2023 00:37:58 +0530 Subject: [PATCH 16/77] Fix some positions --- src/blib2to3/pgen2/tokenize.py | 46 ++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 276ccab531f..2ecfbfdf892 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -624,6 +624,7 @@ def generate_tokens( start, end = endmatch.span(0) token = line[start:end] # TODO: triple quotes + # TODO: check if the token will ever have any whitespace around? middle_token, end_token = token[:-1], token[-1] # TODO: unsure if this can be safely removed if stashed: @@ -703,24 +704,43 @@ def generate_tokens( token = line[start:pos] yield (STRING, token, spos, epos, line) else: - # TODO: most of the positions are wrong - pos = endmatch.end(0) - token = line[start:pos] - yield ( - FSTRING_MIDDLE, - token, - spos, - epos, - line, - ) + end = endmatch.end(0) + token = line[pos:end] + spos, epos = (lnum, pos), (lnum, end) + # TODO: confirm there will be no padding around the tokens + # TODO: don't detect like this perhaps? 
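                    # the end match stops either at the closing quote (the
                    # f-string is done) or at a lone "{" that opens a
                    # replacement field.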
if not token.endswith("{"): - yield (FSTRING_END, token[-1], epos, epos, line) + fstring_middle, fstring_end = token[:-3], token[-3:] + fstring_middle_epos = fstring_end_spos = (lnum, end - 3) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield ( + FSTRING_END, + fstring_end, + fstring_end_spos, + epos, + line, + ) fstring_level -= 1 endprog_stack.pop() else: - # TODO: most of the positions are wrong - yield (LBRACE, "{", epos, epos, line) + fstring_middle, lbrace = token[:-1], token[-1] + fstring_middle_epos = lbrace_spos = (lnum, end - 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield (LBRACE, lbrace, lbrace_spos, epos, line) inside_fstring_braces = True + pos = end else: strstart = (lnum, start) # multiple lines contstr = line[start:] From 95cd0bab0b63bf0300f82d4340c7f25d189ff05f Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 19 Sep 2023 01:16:05 +0530 Subject: [PATCH 17/77] fix edge cases with padding --- src/blib2to3/pgen2/tokenize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 2ecfbfdf892..3ddc44744eb 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -775,26 +775,26 @@ def generate_tokens( else: if pseudomatch[20] is not None: fstring_start = pseudomatch[20] - offset = pseudomatch.end(20) - pseudomatch.start() + offset = pseudomatch.end(20) - pseudomatch.start(1) start_epos = (lnum, start + offset) elif pseudomatch[22] is not None: fstring_start = pseudomatch[22] - offset = pseudomatch.end(22) - pseudomatch.start() + offset = pseudomatch.end(22) - pseudomatch.start(1) start_epos = (lnum, start + offset) elif pseudomatch[24] is not None: fstring_start = pseudomatch[24] - offset = pseudomatch.end(24) - pseudomatch.start() + offset = pseudomatch.end(24) - pseudomatch.start(1) start_epos = (lnum, start + offset) else: fstring_start = pseudomatch[26] - offset = pseudomatch.end(26) - pseudomatch.start() + offset = pseudomatch.end(26) - pseudomatch.start(1) start_epos = (lnum, start + offset) yield (FSTRING_START, fstring_start, spos, start_epos, line) fstring_level += 1 endprog = endprogs[fstring_start] endprog_stack.append(endprog) - end_offset = pseudomatch.end() - 1 + end_offset = pseudomatch.end(1) - 1 fstring_middle = line[start + offset : end_offset] middle_spos = (lnum, start + offset) middle_epos = (lnum, end_offset) From caafa758442d9abfab067b0d2eafd51546d389a2 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 20 Sep 2023 18:52:31 +0530 Subject: [PATCH 18/77] fix nested fstrings bug --- src/blib2to3/pgen2/tokenize.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 3ddc44744eb..3645a5022b9 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -497,12 +497,6 @@ def generate_tokens( lnum += 1 pos, max = 0, len(line) - # TODO: probably inside_fstring_braces is not the best boolean. - # what about a case of a string inside a multiline fstring inside a - # multiline fstring?? - # for eg. 
this doesn't work right now: f"{f'{2+2}'}" - # because inside_fstring_braces gets set to false after the first `}` - # print(f'{parenlev = } {continued = } {inside_fstring_braces = }') if contstr and not inside_fstring_braces: # continued string assert contline is not None if not line: @@ -861,7 +855,7 @@ def generate_tokens( stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 - elif initial == "}" and parenlev == 0 and inside_fstring_braces: + elif initial == "}" and parenlev == 0 and fstring_level > 0: yield (RBRACE, token, spos, epos, line) inside_fstring_braces = False else: From 838f627175f1391646db4942535d73aa6d6dddea Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 20 Sep 2023 22:44:14 +0530 Subject: [PATCH 19/77] Fix bugs in multiline fstrings --- src/blib2to3/pgen2/tokenize.py | 56 ++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 3645a5022b9..075daf667e1 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -504,7 +504,7 @@ def generate_tokens( endprog = endprog_stack[-1] endmatch = endprog.match(line) if endmatch: - pos = end = endmatch.end(0) + end = endmatch.end(0) token = contstr + line[:end] spos = strstart epos = (lnum, end) @@ -512,18 +512,40 @@ def generate_tokens( # TODO: better way to detect fstring if fstring_level == 0: yield (STRING, token, spos, epos, tokenline) + endprog_stack.pop() else: - # TODO: positions are all wrong - yield (FSTRING_MIDDLE, token, spos, epos, tokenline) if token.endswith("{"): - yield (LBRACE, "{", spos, epos, tokenline) + fstring_middle, lbrace = token[:-1], token[-1] + fstring_middle_epos = lbrace_spos = (lnum, end - 1) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield (LBRACE, lbrace, lbrace_spos, epos, line) inside_fstring_braces = True else: - yield (FSTRING_END, token[-1], spos, epos, tokenline) + # TODO: -3 maybe not guaranteed + fstring_middle, fstring_end = token[:-3], token[-3:] + fstring_middle_epos = end_spos = (lnum, end - 3) + yield ( + FSTRING_MIDDLE, + fstring_middle, + spos, + fstring_middle_epos, + line, + ) + yield ( + FSTRING_END, + fstring_end, + end_spos, + epos, + line, + ) fstring_level -= 1 - endprog_stack.pop() - # TODO: contstr reliance doesn't work now because we can be inside - # an fstring and still empty contstr right here. 
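+            # pos now points past the closing match; the continuation-string
+            # buffers below are reset for whatever follows.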
+ pos = end contstr, needcont = "", 0 contline = None elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n": @@ -648,7 +670,8 @@ def generate_tokens( inside_fstring_braces = True pos = end else: # multiple lines - contstr += line + strstart = (lnum, end) + contstr = line[end:] contline = line break @@ -681,10 +704,10 @@ def generate_tokens( yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: endprog = endprogs[token] + endprog_stack.append(endprog) if token.startswith("f"): yield (FSTRING_START, token, spos, epos, line) fstring_level += 1 - endprog_stack.append(endprog) endmatch = endprog.match(line, pos) if endmatch: # all on one line @@ -697,6 +720,7 @@ def generate_tokens( pos = endmatch.end(0) token = line[start:pos] yield (STRING, token, spos, epos, line) + endprog_stack.pop() else: end = endmatch.end(0) token = line[pos:end] @@ -736,8 +760,14 @@ def generate_tokens( inside_fstring_braces = True pos = end else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] + # multiple lines + # TODO: normalize fstring detection + if token.startswith("f"): + strstart = (lnum, pos) + contstr = line[pos:] + else: + strstart = (lnum, start) + contstr = line[start:] contline = line break elif ( @@ -758,7 +788,7 @@ def generate_tokens( contstr, needcont = line[start:], 1 contline = line break - else: # ordinary string + else: # single line string if stashed: yield stashed stashed = None From f5abd4b63b052b460fd2e9fbb436b9ebdea6379e Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 21 Sep 2023 01:46:38 +0530 Subject: [PATCH 20/77] support fstring_middle ending with newline --- src/blib2to3/pgen2/driver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index e629843f8b9..45109eabf30 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -167,7 +167,9 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> if type in {token.INDENT, token.DEDENT}: prefix = _prefix lineno, column = end - if value.endswith("\n"): + # FSTRING_MIDDLE is the only character that can end with a newline, and + # `end` will point to the next line. For that case, don't increment lineno. + if value.endswith("\n") and type != token.FSTRING_MIDDLE: lineno += 1 column = 0 else: From fd3e5e1caa0ad5f2d8c87f9cda43529698c4c436 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 23 Sep 2023 21:20:02 +0530 Subject: [PATCH 21/77] fix edge case for triple quoted strings --- src/blib2to3/pgen2/tokenize.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 075daf667e1..e3c6da15c78 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -527,7 +527,7 @@ def generate_tokens( yield (LBRACE, lbrace, lbrace_spos, epos, line) inside_fstring_braces = True else: - # TODO: -3 maybe not guaranteed + # TODO: -3 maybe not guaranteed, could be \ separated single line string fstring_middle, fstring_end = token[:-3], token[-3:] fstring_middle_epos = end_spos = (lnum, end - 3) yield ( @@ -639,9 +639,13 @@ def generate_tokens( if endmatch: # all on one line start, end = endmatch.span(0) token = line[start:end] - # TODO: triple quotes # TODO: check if the token will ever have any whitespace around? 
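            # split off the closing quote(s): three characters for a
            # triple-quoted f-string, one otherwise.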
- middle_token, end_token = token[:-1], token[-1] + if token.endswith(('"""', "'''")): + middle_token, end_token = token[:-3], token[-3:] + middle_epos = end_spos = (lnum, end - 3) + else: + middle_token, end_token = token[:-1], token[-1] + middle_epos = end_spos = (lnum, end - 1) # TODO: unsure if this can be safely removed if stashed: yield stashed @@ -650,15 +654,14 @@ def generate_tokens( FSTRING_MIDDLE, middle_token, (lnum, pos), - (lnum, end - 1), + middle_epos, line, ) if not token.endswith("{"): - # TODO: end-1 is probably wrong yield ( FSTRING_END, end_token, - (lnum, end - 1), + end_spos, (lnum, end), line, ) From 0c6906915f36f8906d57eaba9d25f609a67b0dc0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 23 Sep 2023 22:55:52 +0530 Subject: [PATCH 22/77] Add string normalization --- src/black/linegen.py | 25 ++++++++++++++++ src/black/strings.py | 69 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 507e860190f..1d4e701ebbb 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -60,6 +60,7 @@ from black.strings import ( fix_docstring, get_string_prefix, + normalize_fstring_quotes, normalize_string_prefix, normalize_string_quotes, normalize_unicode_escape_sequences, @@ -480,6 +481,30 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) + def visit_fstring(self, node: Node) -> Iterator[Line]: + """Bunch of hacks here. Needs improvement.""" + fstring_start = node.children[0] + fstring_end = node.children[-1] + + quote_char = fstring_end.value[0] + quote_idx = fstring_start.value.index(quote_char) + prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] + assert 'f' in prefix or 'F' in prefix + assert quote == fstring_end.value + + is_raw_fstring = 'r' in prefix or 'R' in prefix + middles = [node for node in node.children if node.type == token.FSTRING_MIDDLE] + # if ''.join(m.value for m in middles) == 'foo': + # breakpoint() + + if self.mode.string_normalization: + middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) + + fstring_start.value = prefix + quote + fstring_end.value = quote + + yield from self.visit_default(node) + def __post_init__(self) -> None: """You are in a twisty little maze of passages.""" self.current_line = Line(mode=self.mode) diff --git a/src/black/strings.py b/src/black/strings.py index 0d30f09ed11..cd6da62b30d 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -173,8 +173,7 @@ def _cached_compile(pattern: str) -> Pattern[str]: def normalize_string_quotes(s: str) -> str: """Prefer double quotes but only if it doesn't cause more escaping. - Adds or removes backslashes as appropriate. Doesn't parse and fix - strings nested in f-strings. + Adds or removes backslashes as appropriate. 
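+    F-string bodies are handled separately by normalize_fstring_quotes.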
""" value = s.lstrip(STRING_PREFIX_CHARS) if value[:3] == '"""': @@ -215,6 +214,7 @@ def normalize_string_quotes(s: str) -> str: s = f"{prefix}{orig_quote}{body}{orig_quote}" new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body) new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body) + # TODO: can probably be removed if "f" in prefix.casefold(): matches = re.findall( r""" @@ -243,6 +243,71 @@ def normalize_string_quotes(s: str) -> str: return f"{prefix}{new_quote}{new_body}{new_quote}" +def normalize_fstring_quotes( + quote: str, + middles: list[str], + is_raw_fstring: bool +) -> tuple[str, str]: + """Prefer double quotes but only if it doesn't cause more escaping. + + Adds or removes backslashes as appropriate. + """ + if quote == '"""': + return middles, quote + + elif quote == "'''": + new_quote = '"""' + elif quote == '"': + new_quote = "'" + else: + new_quote = '"' + + unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}") + escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}") + escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}") + if is_raw_fstring: + for middle in middles: + if unescaped_new_quote.search(middle.value): + # There's at least one unescaped new_quote in this raw string + # so converting is impossible + return middles, quote + + # Do not introduce or remove backslashes in raw strings + return middles, new_quote + + new_segments = [] + for middle in middles: + segment = middle.value + # remove unnecessary escapes + new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment) + if segment != new_segment: + # Consider the string without unnecessary escapes as the original + middle.value = new_segment + + new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment) + new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment) + new_segments.append(new_segment) + + + if new_quote == '"""' and new_segments[-1][-1:] == '"': + # edge case: + new_segments[-1] = new_segments[-1][:-1] + '\\"' + + for middle, new_segment in zip(middles, new_segments): + orig_escape_count = middle.value.count("\\") + new_escape_count = new_segment.count("\\") + + if new_escape_count > orig_escape_count: + return middles, quote # Do not introduce more escaping + + if new_escape_count == orig_escape_count and quote == '"': + return middles, quote # Prefer double quotes + + for middle, new_segment in zip(middles, new_segments): + middle.value = new_segment + + return middles, new_quote + def normalize_unicode_escape_sequences(leaf: Leaf) -> None: """Replace hex codes in Unicode escape sequences with lowercase representation.""" From c4d457e742c5388ead4d9bdeaf795fe6a094a7e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 23 Sep 2023 17:26:14 +0000 Subject: [PATCH 23/77] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/black/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/strings.py b/src/black/strings.py index cd6da62b30d..1300055ccbc 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -283,7 +283,7 @@ def normalize_fstring_quotes( if segment != new_segment: # Consider the string without unnecessary escapes as the original middle.value = new_segment - + new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment) new_segment = sub_twice(unescaped_new_quote, 
rf"\1\\{new_quote}", new_segment) new_segments.append(new_segment) From ace80e01ccb5d472519cb170348933900a89d8a0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 23 Sep 2023 23:45:12 +0530 Subject: [PATCH 24/77] small bugfixes --- src/blib2to3/pgen2/tokenize.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index e3c6da15c78..ab846f0ae0d 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -471,6 +471,7 @@ def generate_tokens( logical line; continuation lines are included. """ lnum = parenlev = fstring_level = continued = 0 + parenlev_stack = [] inside_fstring_braces = False numchars: Final[str] = "0123456789" contstr, needcont = "", 0 @@ -513,6 +514,7 @@ def generate_tokens( if fstring_level == 0: yield (STRING, token, spos, epos, tokenline) endprog_stack.pop() + parenlev = parenlev_stack.pop() else: if token.endswith("{"): fstring_middle, lbrace = token[:-1], token[-1] @@ -667,11 +669,12 @@ def generate_tokens( ) fstring_level -= 1 endprog_stack.pop() + parenlev = parenlev_stack.pop() else: - # TODO: most of the positions are wrong - yield (LBRACE, "{", (lnum, 0), (lnum, 0), line) + yield (LBRACE, "{", (lnum, end-1), (lnum, end), line) inside_fstring_braces = True pos = end + continue else: # multiple lines strstart = (lnum, end) contstr = line[end:] @@ -708,7 +711,9 @@ def generate_tokens( elif token in triple_quoted: endprog = endprogs[token] endprog_stack.append(endprog) - if token.startswith("f"): + parenlev_stack.append(parenlev) + parenlev = 0 + if token.startswith(("f", "F")): yield (FSTRING_START, token, spos, epos, line) fstring_level += 1 @@ -719,11 +724,12 @@ def generate_tokens( stashed = None # TODO: move this logic to a function # TODO: not how you should identify FSTRING_START - if not token.startswith("f"): + if not token.startswith(("f", "F")): pos = endmatch.end(0) token = line[start:pos] yield (STRING, token, spos, epos, line) endprog_stack.pop() + parenlev = parenlev_stack.pop() else: end = endmatch.end(0) token = line[pos:end] @@ -749,6 +755,7 @@ def generate_tokens( ) fstring_level -= 1 endprog_stack.pop() + parenlev = parenlev_stack.pop() else: fstring_middle, lbrace = token[:-1], token[-1] fstring_middle_epos = lbrace_spos = (lnum, end - 1) @@ -765,7 +772,7 @@ def generate_tokens( else: # multiple lines # TODO: normalize fstring detection - if token.startswith("f"): + if token.startswith(("f", "F")): strstart = (lnum, pos) contstr = line[pos:] else: @@ -787,6 +794,8 @@ def generate_tokens( endprog = maybe_endprog if token[-1] == "\n": # continued string endprog_stack.append(endprog) + parenlev_stack.append(parenlev) + parenlev = 0 strstart = (lnum, start) contstr, needcont = line[start:], 1 contline = line @@ -797,7 +806,7 @@ def generate_tokens( stashed = None # TODO: move this logic to a function - if not token.startswith("f"): + if not token.startswith(("f", "F")): yield (STRING, token, spos, epos, line) else: if pseudomatch[20] is not None: @@ -820,6 +829,8 @@ def generate_tokens( fstring_level += 1 endprog = endprogs[fstring_start] endprog_stack.append(endprog) + parenlev_stack.append(parenlev) + parenlev = 0 end_offset = pseudomatch.end(1) - 1 fstring_middle = line[start + offset : end_offset] @@ -838,6 +849,7 @@ def generate_tokens( yield (FSTRING_END, token[-1], end_spos, end_epos, line) fstring_level -= 1 endprog_stack.pop() + parenlev = parenlev_stack.pop() else: end_spos = (lnum, end_offset) end_epos = 
(lnum, end_offset + 1) From c0a99c86530102b88079e84498a82c2c4b3f567e Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 00:17:40 +0530 Subject: [PATCH 25/77] fix some bugs that I introduced just now --- src/blib2to3/pgen2/tokenize.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index ab846f0ae0d..7417dfd7f6b 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -547,6 +547,10 @@ def generate_tokens( line, ) fstring_level -= 1 + endprog_stack.pop() + parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True pos = end contstr, needcont = "", 0 contline = None @@ -670,6 +674,8 @@ def generate_tokens( fstring_level -= 1 endprog_stack.pop() parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True else: yield (LBRACE, "{", (lnum, end-1), (lnum, end), line) inside_fstring_braces = True @@ -756,6 +762,8 @@ def generate_tokens( fstring_level -= 1 endprog_stack.pop() parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True else: fstring_middle, lbrace = token[:-1], token[-1] fstring_middle_epos = lbrace_spos = (lnum, end - 1) @@ -850,6 +858,8 @@ def generate_tokens( fstring_level -= 1 endprog_stack.pop() parenlev = parenlev_stack.pop() + if fstring_level > 0: + inside_fstring_braces = True else: end_spos = (lnum, end_offset) end_epos = (lnum, end_offset + 1) From b02cf2a2f47ab218f1f1a4ff57d59b526b7d63e7 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 03:31:53 +0530 Subject: [PATCH 26/77] strings and fstrings can have implicit concat --- src/blib2to3/Grammar.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 0b369ddd9ff..54395423c1e 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -166,7 +166,7 @@ atom: ('(' [yield_expr|testlist_gexp] ')' | '[' [listmaker] ']' | '{' [dictsetmaker] '}' | '`' testlist1 '`' | - NAME | NUMBER | STRING+ | fstring+ | '.' '.' '.') + NAME | NUMBER | (STRING | fstring)+ | '.' '.' 
'.') listmaker: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) testlist_gexp: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] ) lambdef: 'lambda' [varargslist] ':' test From acd3c79918c0a8719cfb6eb9fae054af83a3c2e4 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 03:32:12 +0530 Subject: [PATCH 27/77] don't normalize docstring prefixes --- src/black/linegen.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 1d4e701ebbb..a4dc7825a77 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -489,13 +489,14 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: quote_char = fstring_end.value[0] quote_idx = fstring_start.value.index(quote_char) prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] - assert 'f' in prefix or 'F' in prefix + + if not is_docstring(node): + prefix = normalize_string_prefix(prefix) + assert quote == fstring_end.value is_raw_fstring = 'r' in prefix or 'R' in prefix middles = [node for node in node.children if node.type == token.FSTRING_MIDDLE] - # if ''.join(m.value for m in middles) == 'foo': - # breakpoint() if self.mode.string_normalization: middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) From 5bca062047fde535bb63d545653e1e1dd92dd52b Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 04:02:01 +0530 Subject: [PATCH 28/77] Add !r format specifier support --- src/black/__init__.py | 1 + src/black/nodes.py | 5 +++++ src/blib2to3/Grammar.txt | 3 ++- src/blib2to3/pgen2/grammar.py | 1 + src/blib2to3/pgen2/token.py | 3 ++- src/blib2to3/pgen2/tokenize.py | 11 +++++++++++ 6 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 6fc91d2e6d3..425a2f9b3d3 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1184,6 +1184,7 @@ def get_features_used( # noqa: C901 if is_string_token(n): value_head = n.value[:2] if value_head in {'f"', 'F"', "f'", "F'", "rf", "fr", "RF", "FR"}: + # TODO: this will need tweaking features.add(Feature.F_STRINGS) if Feature.DEBUG_F_STRINGS not in features: for span_beg, span_end in iter_fexpr_spans(n.value): diff --git a/src/black/nodes.py b/src/black/nodes.py index 3f9f0a9be2e..8abbf00b5bc 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -136,6 +136,7 @@ STANDALONE_COMMENT, token.FSTRING_MIDDLE, token.FSTRING_END, + token.BANG, } RARROW = 55 @@ -266,6 +267,9 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool) -> str: # noqa: C901 elif prev.type in OPENING_BRACKETS: return NO + elif prev.type == token.BANG: + return NO + if p.type in {syms.parameters, syms.arglist}: # untyped function signatures or calls if not prev or prev.type != token.COMMA: @@ -384,6 +388,7 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool) -> str: # noqa: C901 elif prevp.type == token.EQUAL and prevp_parent.type == syms.argument: return NO + # TODO: add fstring here? 
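
# Sketch of the spacing rule the BANG handling above encodes: inside an
# f-string replacement field, "!" introduces a conversion (one of s, r, a)
# and must be emitted with no whitespace on either side -- f"{x!r}", never
# f"{x ! r}". `render_field` is a hypothetical helper, shown only to make
# the intended token layout concrete; it is not part of Black.
def render_field(expression: str, conversion: str = "") -> str:
    assert conversion in ("", "s", "r", "a"), "only three conversions exist"
    suffix = f"!{conversion}" if conversion else ""
    return "{" + expression + suffix + "}"

assert render_field("x", "r") == "{x!r}"
assert render_field("x") == "{x}"
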
elif t in {token.NAME, token.NUMBER, token.STRING}: return NO diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 54395423c1e..4e78145cfcb 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -260,4 +260,5 @@ pattern: (expr|star_expr) ['as' expr] fstring: FSTRING_START fstring_middle* FSTRING_END fstring_middle: fstring_replacement_field | FSTRING_MIDDLE -fstring_replacement_field: '{' (yield_expr | testlist_star_expr) '}' +fstring_replacement_field: '{' (yield_expr | testlist_star_expr) [ "!" NAME ] [ ':' fstring_format_spec* ] '}' +fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field diff --git a/src/blib2to3/pgen2/grammar.py b/src/blib2to3/pgen2/grammar.py index 1f3fdc55b97..804db1ad985 100644 --- a/src/blib2to3/pgen2/grammar.py +++ b/src/blib2to3/pgen2/grammar.py @@ -218,6 +218,7 @@ def report(self) -> None: //= DOUBLESLASHEQUAL -> RARROW := COLONEQUAL +! BANG """ opmap = {} diff --git a/src/blib2to3/pgen2/token.py b/src/blib2to3/pgen2/token.py index 761cc1c7e88..3068c3157fc 100644 --- a/src/blib2to3/pgen2/token.py +++ b/src/blib2to3/pgen2/token.py @@ -69,7 +69,8 @@ FSTRING_START: Final = 60 FSTRING_MIDDLE: Final = 61 FSTRING_END: Final = 62 -N_TOKENS: Final = 63 +BANG: Final = 63 +N_TOKENS: Final = 64 NT_OFFSET: Final = 256 # --end constants-- diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 7417dfd7f6b..1f8514d977d 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -141,6 +141,10 @@ def _combinations(*l: str) -> Set[str]: Single3Lbrace = r"[^'\\{]*(?:(?:\\.|{{|'(?!''))[^'\\{]*)*{(?!{)" Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{(?!{)' +# ! format specifier inside an fstring brace +Bang = Whitespace + group("!") +bang = re.compile(Bang) + # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get # recognized as two instances of =). @@ -687,6 +691,13 @@ def generate_tokens( contline = line break + if fstring_level > 0 and inside_fstring_braces: + match = bang.match(line, pos) + if match: + start, end = match.span(1) + yield (OP, "!", (lnum, start), (lnum, end), line) + pos = end + pseudomatch = pseudoprog.match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) From b755281b74f13089aafdeb98148417b6e4d00673 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 13:12:07 +0530 Subject: [PATCH 29/77] Support non nested format specifiers --- src/blib2to3/Grammar.txt | 2 +- src/blib2to3/pgen2/tokenize.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 4e78145cfcb..43b23c51453 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -260,5 +260,5 @@ pattern: (expr|star_expr) ['as' expr] fstring: FSTRING_START fstring_middle* FSTRING_END fstring_middle: fstring_replacement_field | FSTRING_MIDDLE -fstring_replacement_field: '{' (yield_expr | testlist_star_expr) [ "!" NAME ] [ ':' fstring_format_spec* ] '}' +fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" 
NAME ] [ ':' fstring_format_spec ] '}' fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 1f8514d977d..8b96c4bee77 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -144,6 +144,11 @@ def _combinations(*l: str) -> Set[str]: # ! format specifier inside an fstring brace Bang = Whitespace + group("!") bang = re.compile(Bang) +Colon = Whitespace + group(":") +colon = re.compile(Colon) + +FstringMiddleAfterColon = Whitespace + group(r".+?") + group("{", "}") +fstring_middle_after_colon = re.compile(FstringMiddleAfterColon) # Because of leftmost-then-longest match semantics, be sure to put the # longest operators first (e.g., if = came before ==, == would get @@ -477,6 +482,7 @@ def generate_tokens( lnum = parenlev = fstring_level = continued = 0 parenlev_stack = [] inside_fstring_braces = False + inside_fstring_colon = False numchars: Final[str] = "0123456789" contstr, needcont = "", 0 contline: Optional[str] = None @@ -681,7 +687,7 @@ def generate_tokens( if fstring_level > 0: inside_fstring_braces = True else: - yield (LBRACE, "{", (lnum, end-1), (lnum, end), line) + yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line) inside_fstring_braces = True pos = end continue @@ -691,12 +697,37 @@ def generate_tokens( contline = line break + # TODO: fstring_level > 0 is redundant in both cases here, + # remove it and ensure nothing breaks + if fstring_level > 0 and inside_fstring_colon: + match = fstring_middle_after_colon.match(line, pos) + if match is None: + raise TokenError("unterminated f-string literal", (lnum, pos)) + + start, end = match.span(1) + token = line[start:end] + yield (FSTRING_MIDDLE, token, (lnum, start), (lnum, end), line) + inside_fstring_colon = False + pos = end + continue + if fstring_level > 0 and inside_fstring_braces: match = bang.match(line, pos) if match: start, end = match.span(1) yield (OP, "!", (lnum, start), (lnum, end), line) pos = end + continue + + match = colon.match(line, pos) + if match: + start, end = match.span(1) + yield (OP, ":", (lnum, start), (lnum, end), line) + inside_fstring_colon = True + pos = end + continue + + # TODO: `=` is left, eg. 
f"{abc = }" pseudomatch = pseudoprog.match(line, pos) if pseudomatch: # scan for tokens From 8f7ecdfd6f044e0f8fdf97b4cad6cf8acba2be1c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 24 Sep 2023 07:42:32 +0000 Subject: [PATCH 30/77] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 8b96c4bee77..e8616b912bf 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -703,7 +703,7 @@ def generate_tokens( match = fstring_middle_after_colon.match(line, pos) if match is None: raise TokenError("unterminated f-string literal", (lnum, pos)) - + start, end = match.span(1) token = line[start:end] yield (FSTRING_MIDDLE, token, (lnum, start), (lnum, end), line) From 00dc7ac025751eff1e1d6612387af7ead4b0b8c4 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 13:19:35 +0530 Subject: [PATCH 31/77] fix walrus edge case --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index e8616b912bf..8cdfdc2a823 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -711,7 +711,7 @@ def generate_tokens( pos = end continue - if fstring_level > 0 and inside_fstring_braces: + if fstring_level > 0 and parenlev == 0 and inside_fstring_braces: match = bang.match(line, pos) if match: start, end = match.span(1) From 306b9e926985bd61df09169c8386ed46deb4a2af Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 13:39:30 +0530 Subject: [PATCH 32/77] empty FSTRING_MIDDLE should not be truncated --- src/black/lines.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/black/lines.py b/src/black/lines.py index 0a307b45eff..ea8572e6c0f 100644 --- a/src/black/lines.py +++ b/src/black/lines.py @@ -71,7 +71,12 @@ def append( Inline comments are put aside. 
""" - has_value = leaf.type in BRACKETS or bool(leaf.value.strip()) + has_value = ( + leaf.type in BRACKETS + # empty fstring-middles must not be truncated + or leaf.type == token.FSTRING_MIDDLE + or bool(leaf.value.strip()) + ) if not has_value: return From 7323840a35d39b2bc9d7d9fa9599dc443a74c8b2 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 14:04:54 +0530 Subject: [PATCH 33/77] support rf" tokens --- src/black/strings.py | 4 ++-- src/blib2to3/pgen2/tokenize.py | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/black/strings.py b/src/black/strings.py index 1300055ccbc..b346e538766 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -272,8 +272,8 @@ def normalize_fstring_quotes( # so converting is impossible return middles, quote - # Do not introduce or remove backslashes in raw strings - return middles, new_quote + # Do not introduce or remove backslashes in raw strings, just use double quote + return middles, '"' new_segments = [] for middle in middles: diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 8cdfdc2a823..b8d6e28c86b 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -228,6 +228,12 @@ def _combinations(*l: str) -> Set[str]: | {f"{prefix}'" for prefix in _strprefixes | _fstring_prefixes} | {f'{prefix}"' for prefix in _strprefixes | _fstring_prefixes} ) +fstring_prefix: Final = ( + {f"{prefix}'" for prefix in _fstring_prefixes} + | {f'{prefix}"' for prefix in _fstring_prefixes} + | {f"{prefix}'''" for prefix in _fstring_prefixes} + | {f'{prefix}"""' for prefix in _fstring_prefixes} +) tabsize = 8 @@ -461,6 +467,15 @@ def untokenize(iterable: Iterable[TokenInfo]) -> str: return ut.untokenize(iterable) +def is_fstring_start(token: str) -> bool: + # TODO: builtins.any is shadowed :( + for prefix in fstring_prefix: + if token.startswith(prefix): + return True + + return False + + def generate_tokens( readline: Callable[[], str], grammar: Optional[Grammar] = None ) -> Iterator[GoodTokenInfo]: @@ -761,7 +776,7 @@ def generate_tokens( endprog_stack.append(endprog) parenlev_stack.append(parenlev) parenlev = 0 - if token.startswith(("f", "F")): + if is_fstring_start(token): yield (FSTRING_START, token, spos, epos, line) fstring_level += 1 @@ -771,8 +786,7 @@ def generate_tokens( yield stashed stashed = None # TODO: move this logic to a function - # TODO: not how you should identify FSTRING_START - if not token.startswith(("f", "F")): + if not is_fstring_start(token): pos = endmatch.end(0) token = line[start:pos] yield (STRING, token, spos, epos, line) @@ -821,8 +835,7 @@ def generate_tokens( pos = end else: # multiple lines - # TODO: normalize fstring detection - if token.startswith(("f", "F")): + if is_fstring_start(token): strstart = (lnum, pos) contstr = line[pos:] else: @@ -855,8 +868,7 @@ def generate_tokens( yield stashed stashed = None - # TODO: move this logic to a function - if not token.startswith(("f", "F")): + if not is_fstring_start(token): yield (STRING, token, spos, epos, line) else: if pseudomatch[20] is not None: From 4fc656d6a13ac5b4a50e2267b4260f6451913950 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 24 Sep 2023 14:35:17 +0530 Subject: [PATCH 34/77] fix fstring feature detection --- src/black/__init__.py | 16 ++++++---------- tests/test_black.py | 3 +-- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 425a2f9b3d3..01ebfaebeaa 100644 --- 
a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1181,16 +1181,12 @@ def get_features_used( # noqa: C901 } for n in node.pre_order(): - if is_string_token(n): - value_head = n.value[:2] - if value_head in {'f"', 'F"', "f'", "F'", "rf", "fr", "RF", "FR"}: - # TODO: this will need tweaking - features.add(Feature.F_STRINGS) - if Feature.DEBUG_F_STRINGS not in features: - for span_beg, span_end in iter_fexpr_spans(n.value): - if n.value[span_beg : span_end - 1].rstrip().endswith("="): - features.add(Feature.DEBUG_F_STRINGS) - break + if n.type == token.FSTRING_START: + features.add(Feature.F_STRINGS) + elif n.type == token.RBRACE and any( + child.type == token.EQUAL for child in n.parent.children + ): + features.add(Feature.DEBUG_F_STRINGS) elif is_number_token(n): if "_" in n.value: diff --git a/tests/test_black.py b/tests/test_black.py index 79930fabf1f..c0ab06af9c0 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -360,12 +360,11 @@ def test_detect_debug_f_strings(self) -> None: features = black.get_features_used(root) self.assertNotIn(black.Feature.DEBUG_F_STRINGS, features) - # We don't yet support feature version detection in nested f-strings root = black.lib2to3_parse( """f"heard a rumour that { f'{1+1=}' } ... seems like it could be true" """ ) features = black.get_features_used(root) - self.assertNotIn(black.Feature.DEBUG_F_STRINGS, features) + self.assertIn(black.Feature.DEBUG_F_STRINGS, features) @patch("black.dump_to_file", dump_to_stderr) def test_string_quotes(self) -> None: From ea70516b2e17e33fb75c222ba6d3b005c2d70b84 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 30 Sep 2023 18:23:35 +0530 Subject: [PATCH 35/77] fix edge cases in format specifier tokenizing --- src/blib2to3/pgen2/tokenize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index b8d6e28c86b..473bd8434b0 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -147,7 +147,7 @@ def _combinations(*l: str) -> Set[str]: Colon = Whitespace + group(":") colon = re.compile(Colon) -FstringMiddleAfterColon = Whitespace + group(r".+?") + group("{", "}") +FstringMiddleAfterColon = Whitespace + group(r".*?") + group("{", "}") fstring_middle_after_colon = re.compile(FstringMiddleAfterColon) # Because of leftmost-then-longest match semantics, be sure to put the @@ -726,7 +726,7 @@ def generate_tokens( pos = end continue - if fstring_level > 0 and parenlev == 0 and inside_fstring_braces: + if fstring_level > 0 and inside_fstring_braces: match = bang.match(line, pos) if match: start, end = match.span(1) From 4b80fe1b289e2355e59d9f1339c3f4478edb95fc Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 01:31:50 +0530 Subject: [PATCH 36/77] fix that one bug with depending on parenlev --- src/blib2to3/Grammar.txt | 3 ++- src/blib2to3/pgen2/tokenize.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 43b23c51453..5829bb55bc9 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -259,6 +259,7 @@ patterns: pattern (',' pattern)* [','] pattern: (expr|star_expr) ['as' expr] fstring: FSTRING_START fstring_middle* FSTRING_END +# TODO making these FSTRING_MIDDLE makes them unformattable so maybe put a new token here? fstring_middle: fstring_replacement_field | FSTRING_MIDDLE -fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" 
NAME ] [ ':' fstring_format_spec ] '}' +fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" NAME ] [ ':' fstring_format_spec* ] '}' fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 473bd8434b0..d636875785e 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -498,6 +498,7 @@ def generate_tokens( parenlev_stack = [] inside_fstring_braces = False inside_fstring_colon = False + bracelev = 0 numchars: Final[str] = "0123456789" contstr, needcont = "", 0 contline: Optional[str] = None @@ -722,11 +723,19 @@ def generate_tokens( start, end = match.span(1) token = line[start:end] yield (FSTRING_MIDDLE, token, (lnum, start), (lnum, end), line) + + brace_start, brace_end = match.span(2) + brace = line[brace_start:brace_end] + if brace == '{': + yield (OP, brace, (lnum, brace_start), (lnum, brace_end), line) + bracelev += 1 + end = brace_end + inside_fstring_colon = False pos = end continue - if fstring_level > 0 and inside_fstring_braces: + if fstring_level > 0 and parenlev == 0 and inside_fstring_braces: match = bang.match(line, pos) if match: start, end = match.span(1) @@ -964,11 +973,13 @@ def generate_tokens( stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 - elif initial == "}" and parenlev == 0 and fstring_level > 0: + elif initial == "}" and parenlev == 0 and bracelev == 0 and fstring_level > 0: yield (RBRACE, token, spos, epos, line) inside_fstring_braces = False else: - if initial in "([{": + if parenlev == 0 and bracelev > 0 and initial == '}': + bracelev -= 1 + elif initial in "([{": parenlev += 1 elif initial in ")]}": parenlev -= 1 @@ -987,6 +998,8 @@ def generate_tokens( for _indent in indents[1:]: # pop remaining indent levels yield (DEDENT, "", (lnum, 0), (lnum, 0), "") yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") + assert len(endprog_stack) == 0 + assert len(parenlev_stack) == 0 if __name__ == "__main__": # testing From 420867d51246a60e6ba15aa8146a85f0bc8730d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 1 Oct 2023 20:02:11 +0000 Subject: [PATCH 37/77] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index d636875785e..2a99c335124 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -723,7 +723,7 @@ def generate_tokens( start, end = match.span(1) token = line[start:end] yield (FSTRING_MIDDLE, token, (lnum, start), (lnum, end), line) - + brace_start, brace_end = match.span(2) brace = line[brace_start:brace_end] if brace == '{': From 160ef4ef977ba547f9242c8c8e9f0dad0e3bd319 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 19:03:39 +0530 Subject: [PATCH 38/77] fix line location for triple quoted strings --- src/blib2to3/pgen2/tokenize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 2a99c335124..043855d0083 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -798,6 +798,7 @@ def generate_tokens( if not is_fstring_start(token): pos = endmatch.end(0) token = line[start:pos] + epos = (lnum, pos) yield (STRING, token, spos, epos, line) endprog_stack.pop() parenlev = 
parenlev_stack.pop() From edf3d795d089aff0d2335287fa7b458c446d434f Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 19:18:17 +0530 Subject: [PATCH 39/77] try to fix mypy errors --- src/black/__init__.py | 2 +- src/black/linegen.py | 6 ++++-- src/black/nodes.py | 6 +++--- src/black/strings.py | 6 +++--- src/blib2to3/pgen2/tokenize.py | 4 ++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 01ebfaebeaa..053592bb31f 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1183,7 +1183,7 @@ def get_features_used( # noqa: C901 for n in node.pre_order(): if n.type == token.FSTRING_START: features.add(Feature.F_STRINGS) - elif n.type == token.RBRACE and any( + elif n.type == token.RBRACE and n.parent is not None and any( child.type == token.EQUAL for child in n.parent.children ): features.add(Feature.DEBUG_F_STRINGS) diff --git a/src/black/linegen.py b/src/black/linegen.py index a4dc7825a77..0aeec8b8dcb 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -481,10 +481,12 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) - def visit_fstring(self, node: Node) -> Iterator[Line]: + def visit_fstring(self, node: Leaf) -> Iterator[Line]: """Bunch of hacks here. Needs improvement.""" fstring_start = node.children[0] fstring_end = node.children[-1] + assert isinstance(fstring_start, Leaf) + assert isinstance(fstring_end, Leaf) quote_char = fstring_end.value[0] quote_idx = fstring_start.value.index(quote_char) @@ -496,7 +498,7 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: assert quote == fstring_end.value is_raw_fstring = 'r' in prefix or 'R' in prefix - middles = [node for node in node.children if node.type == token.FSTRING_MIDDLE] + middles = [node for node in node.leaves() if node.type == token.FSTRING_MIDDLE] if self.mode.string_normalization: middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) diff --git a/src/black/nodes.py b/src/black/nodes.py index 8abbf00b5bc..2723eb8baae 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -531,14 +531,14 @@ def is_arith_like(node: LN) -> bool: } -def is_docstring(leaf: Leaf) -> bool: +def is_docstring(node: NL) -> bool: if prev_siblings_are( - leaf.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt] + node.parent, [None, token.NEWLINE, token.INDENT, syms.simple_stmt] ): return True # Multiline docstring on the same line as the `def`. - if prev_siblings_are(leaf.parent, [syms.parameters, token.COLON, syms.simple_stmt]): + if prev_siblings_are(node.parent, [syms.parameters, token.COLON, syms.simple_stmt]): # `syms.parameters` is only used in funcdefs and async_funcdefs in the Python # grammar. We're safe to return True without further checks. return True diff --git a/src/black/strings.py b/src/black/strings.py index b346e538766..7d8ba9b446d 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -5,7 +5,7 @@ import re import sys from functools import lru_cache -from typing import Final, List, Match, Pattern +from typing import Final, List, Match, Pattern, Tuple from black._width_table import WIDTH_TABLE from blib2to3.pytree import Leaf @@ -245,9 +245,9 @@ def normalize_string_quotes(s: str) -> str: def normalize_fstring_quotes( quote: str, - middles: list[str], + middles: List[Leaf], is_raw_fstring: bool -) -> tuple[str, str]: +) -> Tuple[List[Leaf], str]: """Prefer double quotes but only if it doesn't cause more escaping. 
Adds or removes backslashes as appropriate. diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 043855d0083..c21b1223a80 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -495,7 +495,7 @@ def generate_tokens( logical line; continuation lines are included. """ lnum = parenlev = fstring_level = continued = 0 - parenlev_stack = [] + parenlev_stack: List[int] = [] inside_fstring_braces = False inside_fstring_colon = False bracelev = 0 @@ -514,7 +514,7 @@ def generate_tokens( async_def_nl = False strstart: Tuple[int, int] - endprog_stack: list[Pattern[str]] = [] + endprog_stack: List[Pattern[str]] = [] while 1: # loop over lines in stream try: From 23bee77e43caf30d415677c2ee30d7e814ef6571 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 19:20:05 +0530 Subject: [PATCH 40/77] commit unstaged change --- src/black/linegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 0aeec8b8dcb..7ace29766f5 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -481,7 +481,7 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) - def visit_fstring(self, node: Leaf) -> Iterator[Line]: + def visit_fstring(self, node: Node) -> Iterator[Line]: """Bunch of hacks here. Needs improvement.""" fstring_start = node.children[0] fstring_end = node.children[-1] From 6997d1474e075959a66f6d7c07abb02103d1b7be Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 22:08:02 +0530 Subject: [PATCH 41/77] Add `fstring_format_spec` to symbols --- src/blib2to3/pygram.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blib2to3/pygram.py b/src/blib2to3/pygram.py index 8c93e4ddb13..af0bd0035ca 100644 --- a/src/blib2to3/pygram.py +++ b/src/blib2to3/pygram.py @@ -72,6 +72,7 @@ class _python_symbols(Symbols): flow_stmt: int for_stmt: int fstring: int + fstring_format_spec: int fstring_middle: int fstring_replacement_field: int funcdef: int From 4e201fc0c89e832110a0fc676ea8cfb38bd7f8bf Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 2 Oct 2023 22:33:07 +0530 Subject: [PATCH 42/77] fix possible cause of mypyc crash --- src/black/linegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 7ace29766f5..c31b1c94543 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -498,7 +498,7 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: assert quote == fstring_end.value is_raw_fstring = 'r' in prefix or 'R' in prefix - middles = [node for node in node.leaves() if node.type == token.FSTRING_MIDDLE] + middles = [leaf for leaf in node.leaves() if leaf.type == token.FSTRING_MIDDLE] if self.mode.string_normalization: middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) From 17a90630cca9ac4a1aec073f307f5093bec72259 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sat, 7 Oct 2023 01:54:39 +0530 Subject: [PATCH 43/77] Fix edge case with wrapping format specs --- src/blib2to3/pgen2/tokenize.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index c21b1223a80..12266ccfa49 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -147,7 +147,7 @@ def _combinations(*l: str) -> Set[str]: Colon = Whitespace + group(":") colon = re.compile(Colon) -FstringMiddleAfterColon = Whitespace + 
group(r".*?") + group("{", "}") +FstringMiddleAfterColon = Whitespace + group(r".*?") + group("{", "}", "\n") fstring_middle_after_colon = re.compile(FstringMiddleAfterColon) # Because of leftmost-then-longest match semantics, be sure to put the @@ -498,6 +498,7 @@ def generate_tokens( parenlev_stack: List[int] = [] inside_fstring_braces = False inside_fstring_colon = False + formatspec = "" bracelev = 0 numchars: Final[str] = "0123456789" contstr, needcont = "", 0 @@ -515,6 +516,7 @@ def generate_tokens( strstart: Tuple[int, int] endprog_stack: List[Pattern[str]] = [] + formatspec_start: Tuple[int, int] while 1: # loop over lines in stream try: @@ -722,12 +724,21 @@ def generate_tokens( start, end = match.span(1) token = line[start:end] - yield (FSTRING_MIDDLE, token, (lnum, start), (lnum, end), line) + formatspec += token brace_start, brace_end = match.span(2) - brace = line[brace_start:brace_end] - if brace == '{': - yield (OP, brace, (lnum, brace_start), (lnum, brace_end), line) + brace_or_nl = line[brace_start:brace_end] + if brace_or_nl == "\n": + # TODO: in a triple quoted string we should infact add the \n here + # formatspec += "\n" + pos = brace_end + continue + + yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) + formatspec = "" + + if brace_or_nl == "{": + yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line) bracelev += 1 end = brace_end @@ -748,6 +759,7 @@ def generate_tokens( start, end = match.span(1) yield (OP, ":", (lnum, start), (lnum, end), line) inside_fstring_colon = True + formatspec_start = (lnum, end) pos = end continue @@ -974,11 +986,16 @@ def generate_tokens( stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 - elif initial == "}" and parenlev == 0 and bracelev == 0 and fstring_level > 0: + elif ( + initial == "}" + and parenlev == 0 + and bracelev == 0 + and fstring_level > 0 + ): yield (RBRACE, token, spos, epos, line) inside_fstring_braces = False else: - if parenlev == 0 and bracelev > 0 and initial == '}': + if parenlev == 0 and bracelev > 0 and initial == "}": bracelev -= 1 elif initial in "([{": parenlev += 1 From d0af0c14415be3adad3b7ea99d84d04d4f07e825 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 15 Oct 2023 00:11:20 +0530 Subject: [PATCH 44/77] Add FSTRING_PARSING as a feature --- src/black/__init__.py | 6 +++++- src/black/mode.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index b1416458ddb..7cb1e3bae05 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1104,7 +1104,11 @@ def _format_str_once(src_contents: str, *, mode: Mode) -> str: elt = EmptyLineTracker(mode=mode) split_line_features = { feature - for feature in {Feature.TRAILING_COMMA_IN_CALL, Feature.TRAILING_COMMA_IN_DEF} + for feature in { + Feature.TRAILING_COMMA_IN_CALL, + Feature.TRAILING_COMMA_IN_DEF, + Feature.FSTRING_PARSING, + } if supports_feature(versions, feature) } block: Optional[LinesBlock] = None diff --git a/src/black/mode.py b/src/black/mode.py index 30c5d2f1b2f..be8db26eca6 100644 --- a/src/black/mode.py +++ b/src/black/mode.py @@ -47,6 +47,7 @@ class Feature(Enum): DEBUG_F_STRINGS = 16 PARENTHESIZED_CONTEXT_MANAGERS = 17 TYPE_PARAMS = 18 + FSTRING_PARSING = 19 FORCE_OPTIONAL_PARENTHESES = 50 # __future__ flags @@ -157,6 +158,7 @@ class Feature(Enum): Feature.EXCEPT_STAR, Feature.VARIADIC_GENERICS, Feature.TYPE_PARAMS, + Feature.FSTRING_PARSING, }, } From 78c1e9c23b769d94916dcd8f92bf16b943773c04 Mon Sep 17 00:00:00 2001 From: 
Tushar Sadhwani Date: Sun, 15 Oct 2023 19:08:34 +0530 Subject: [PATCH 45/77] Add test case --- tests/data/cases/pep_701.py | 157 ++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 tests/data/cases/pep_701.py diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py new file mode 100644 index 00000000000..af5520382af --- /dev/null +++ b/tests/data/cases/pep_701.py @@ -0,0 +1,157 @@ +# flags: --minimum-version=3.12 +x = f"foo" +x = f'foo' +x = f"""foo""" +x = f'''foo''' +x = f"foo {{ bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f'foo {{ {2 + 2}bar {{ baz' +x = f"""foo {{ {2 + 2}bar {{ baz""" +x = f'''foo {{ {2 + 2}bar {{ baz''' + +# edge case: FSTRING_MIDDLE containing only whitespace should not be stripped +x = f"{a} {b}" + +x = f"foo { + 2 + 2 +} bar baz" + +x = f"foo {{ {"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f'a {2 + 2} b'}bar {{ baz" +x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" + +x = f"foo {{ {f'a {f"a {2 + 2} b"} b'}bar {{ baz" +x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" + +x = """foo {{ {2 + 2}bar +baz""" + + +x = f"""foo {{ {2 + 2}bar {{ baz""" + +x = f"""foo {{ { + 2 + 2 +}bar {{ baz""" + + +x = f"""foo {{ { + 2 + 2 +}bar +baz""" + +x = f"""foo {{ a + foo {2 + 2}bar {{ baz + + x = f"foo {{ { + 2 + 2 # comment + }bar" + + {{ baz + + }} buzz + + {print("abc" + "def" +)} +abc""" + +# edge case: end triple quotes at index zero +f"""foo {2+2} bar +""" + +f' \' {f"'"} \' ' +f" \" {f'"'} \" " + +x = f"a{2+2:=^72}b" +x = f"a{2+2:x}b" + +rf'foo' +rf'{foo}' + +x = f"a{2+2:=^{x}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}}b" +f'{(abc:=10)}' + +f"This is a really long string, but just make sure that you reflow fstrings { + 2+2:d +}" +f"This is a really long string, but just make sure that you reflow fstrings correctly {2+2:d}" + +# output + +x = f"foo" +x = f"foo" +x = f"""foo""" +x = f"""foo""" +x = f"foo {{ bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f"foo {{ {2 + 2}bar {{ baz" +x = f"""foo {{ {2 + 2}bar {{ baz""" +x = f"""foo {{ {2 + 2}bar {{ baz""" + +# edge case: FSTRING_MIDDLE containing only whitespace should not be stripped +x = f"{a} {b}" + +x = f"foo {2 + 2} bar baz" + +x = f"foo {{ {"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" + +x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" +x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" + +x = """foo {{ {2 + 2}bar +baz""" + + +x = f"""foo {{ {2 + 2}bar {{ baz""" + +x = f"""foo {{ {2 + 2}bar {{ baz""" + + +x = f"""foo {{ { + 2 + 2 +}bar +baz""" + +x = f"""foo {{ a + foo { + 2 + 2 +}bar {{ baz + + x = f"foo {{ { + 2 + 2 # comment +}bar" + + {{ baz + + }} buzz + + { + print("abc" + "def") +} +abc""" + +# edge case: end triple quotes at index zero +f"""foo { + 2 + 2 +} bar +""" + +f" ' {f"'"} ' " +f' " {f'"'} " ' + +x = f"a{2 + 2:=^72}b" +x = f"a{2 + 2:x}b" + +rf"foo" +rf"{foo}" + +x = f"a{2 + 2:=^{x}}b" +x = f"a{2 + 2:=^{foo(x + y**2):something else}}b" +f"{(abc := 10)}" + +f"This is a really long string, but just make sure that you reflow fstrings {2 + 2:d}" +f"This is a really long string, but just make sure that you reflow fstrings correctly { + 2 + 2:d +}" From 6931c9205d557d14d6c30e014b4ab9cc0a630ca7 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 15 Oct 2023 20:40:39 +0530 Subject: [PATCH 46/77] Add two todos in test case --- tests/data/cases/pep_701.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 
af5520382af..90eb13c35be 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -76,6 +76,18 @@ }" f"This is a really long string, but just make sure that you reflow fstrings correctly {2+2:d}" +# TODO: Edge case: if the fstring replacement ends with a `=` it should not be touched +# f"{2+2=}" +# f"{2+2 = }" +# f"{ 2 + 2 = }" + +# TODO: +# f"""foo { +# datetime.datetime.now():%Y +# %m +# %d +# }""" + # output x = f"foo" @@ -155,3 +167,15 @@ f"This is a really long string, but just make sure that you reflow fstrings correctly { 2 + 2:d }" + +# TODO: Edge case: if the fstring replacement ends with a `=` it should not be touched +# f"{2+2=}" +# f"{2+2 = }" +# f"{ 2 + 2 = }" + +# TODO: +# f"""foo { +# datetime.datetime.now():%Y +# %m +# %d +# }""" From 53ca71c034c5fbb7965fe4a6838fa596d4213339 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 16 Oct 2023 00:42:47 +0530 Subject: [PATCH 47/77] tiny changes --- src/black/linegen.py | 1 - src/blib2to3/pgen2/tokenize.py | 15 +++++---------- tests/data/cases/pep_701.py | 2 +- tests/util.py | 2 +- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index c3f9b3e2261..5f346a2f3cb 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -501,7 +501,6 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) def visit_fstring(self, node: Node) -> Iterator[Line]: - """Bunch of hacks here. Needs improvement.""" fstring_start = node.children[0] fstring_end = node.children[-1] assert isinstance(fstring_start, Leaf) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 12266ccfa49..d9985b6650f 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -538,7 +538,6 @@ def generate_tokens( spos = strstart epos = (lnum, end) tokenline = contline + line - # TODO: better way to detect fstring if fstring_level == 0: yield (STRING, token, spos, epos, tokenline) endprog_stack.pop() @@ -717,7 +716,7 @@ def generate_tokens( # TODO: fstring_level > 0 is redundant in both cases here, # remove it and ensure nothing breaks - if fstring_level > 0 and inside_fstring_colon: + if inside_fstring_colon: match = fstring_middle_after_colon.match(line, pos) if match is None: raise TokenError("unterminated f-string literal", (lnum, pos)) @@ -746,7 +745,7 @@ def generate_tokens( pos = end continue - if fstring_level > 0 and parenlev == 0 and inside_fstring_braces: + if inside_fstring_braces and parenlev == 0: match = bang.match(line, pos) if match: start, end = match.span(1) @@ -818,8 +817,6 @@ def generate_tokens( end = endmatch.end(0) token = line[pos:end] spos, epos = (lnum, pos), (lnum, end) - # TODO: confirm there will be no padding around the tokens - # TODO: don't detect like this perhaps? 
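
# Sketch of the tail-slicing performed just below: once the end pattern
# matches, the matched text is split into the literal FSTRING_MIDDLE part
# and the closing FSTRING_END quotes, whose width is three characters for
# triple-quoted f-strings and one otherwise. Plain strings stand in for the
# real (token, position) bookkeeping; `split_tail` is an invented name.
def split_tail(token: str) -> tuple[str, str]:
    if token.endswith(('"""', "'''")):
        return token[:-3], token[-3:]
    return token[:-1], token[-1]

assert split_tail('hello"""') == ("hello", '"""')
assert split_tail("world'") == ("world", "'")
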
if not token.endswith("{"): fstring_middle, fstring_end = token[:-3], token[-3:] fstring_middle_epos = fstring_end_spos = (lnum, end - 3) @@ -885,7 +882,7 @@ def generate_tokens( contstr, needcont = line[start:], 1 contline = line break - else: # single line string + else: # ordinary string if stashed: yield stashed stashed = None @@ -896,19 +893,17 @@ def generate_tokens( if pseudomatch[20] is not None: fstring_start = pseudomatch[20] offset = pseudomatch.end(20) - pseudomatch.start(1) - start_epos = (lnum, start + offset) elif pseudomatch[22] is not None: fstring_start = pseudomatch[22] offset = pseudomatch.end(22) - pseudomatch.start(1) - start_epos = (lnum, start + offset) elif pseudomatch[24] is not None: fstring_start = pseudomatch[24] offset = pseudomatch.end(24) - pseudomatch.start(1) - start_epos = (lnum, start + offset) else: fstring_start = pseudomatch[26] offset = pseudomatch.end(26) - pseudomatch.start(1) - start_epos = (lnum, start + offset) + + start_epos = (lnum, start + offset) yield (FSTRING_START, fstring_start, spos, start_epos, line) fstring_level += 1 endprog = endprogs[fstring_start] diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 90eb13c35be..9f75836610f 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -1,4 +1,4 @@ -# flags: --minimum-version=3.12 +# flags: --target-version=py312 x = f"foo" x = f'foo' x = f"""foo""" diff --git a/tests/util.py b/tests/util.py index a31ae0992c2..3f4669c140f 100644 --- a/tests/util.py +++ b/tests/util.py @@ -214,7 +214,7 @@ def get_flags_parser() -> argparse.ArgumentParser: "--target-version", action="append", type=lambda val: TargetVersion[val.upper()], - default=(), + default=[], ) parser.add_argument("--line-length", default=DEFAULT_LINE_LENGTH, type=int) parser.add_argument( From e97dd01fba29523c176c40b8c55906ecc6ada990 Mon Sep 17 00:00:00 2001 From: hauntsaninja Date: Fri, 5 Jan 2024 22:38:10 -0800 Subject: [PATCH 48/77] fix merge --- src/black/linegen.py | 2 +- src/black/nodes.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 16c232ffd23..f477d2b12dc 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -518,7 +518,7 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: quote_idx = fstring_start.value.index(quote_char) prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] - if not is_docstring(node): + if not is_docstring(node, self.mode): prefix = normalize_string_prefix(prefix) assert quote == fstring_end.value diff --git a/src/black/nodes.py b/src/black/nodes.py index 19d13816b77..71e9c3b3681 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -556,11 +556,11 @@ def is_docstring(node: NL, mode: Mode) -> bool: if ( Preview.unify_docstring_detection in mode - and leaf.parent - and leaf.parent.type == syms.simple_stmt - and not leaf.parent.prev_sibling - and leaf.parent.parent - and leaf.parent.parent.type == syms.file_input + and node.parent + and node.parent.type == syms.simple_stmt + and not node.parent.prev_sibling + and node.parent.parent + and node.parent.parent.type == syms.file_input ): return True From cf9b415461f2549c4eb0b56af66dc456bea88c39 Mon Sep 17 00:00:00 2001 From: Shantanu <12621235+hauntsaninja@users.noreply.github.com> Date: Fri, 5 Jan 2024 22:42:24 -0800 Subject: [PATCH 49/77] Update src/black/strings.py Co-authored-by: Jelle Zijlstra --- src/black/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/black/strings.py b/src/black/strings.py index b7107da2d9a..8cb10481bc9 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -291,7 +291,7 @@ def normalize_fstring_quotes( new_segments.append(new_segment) - if new_quote == '"""' and new_segments[-1][-1:] == '"': + if new_quote == '"""' and new_segments[-1].endswith('"'): # edge case: new_segments[-1] = new_segments[-1][:-1] + '\\"' From 9737159679a539e116d5688cca1f22954022c16d Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Mon, 12 Feb 2024 06:38:30 -0800 Subject: [PATCH 50/77] changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index b1a6ae3bc1c..a5f48a62b9c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,8 @@ +- Add support for the new Python 3.12 f-string syntax introduced by PEP 701 (#3822) + ### Stable style From e220c10d5e25bd42a208abbf0d06581289491666 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Mon, 12 Feb 2024 06:40:01 -0800 Subject: [PATCH 51/77] Lint, remove unused function --- src/black/__init__.py | 2 -- src/black/nodes.py | 5 ----- src/black/strings.py | 2 +- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 3e7de04cad1..a507528a4dc 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -73,7 +73,6 @@ STARS, is_number_token, is_simple_decorator_expression, - is_string_token, syms, ) from black.output import color_diff, diff, dump_to_file, err, ipynb_diff, out @@ -81,7 +80,6 @@ from black.parsing import lib2to3_parse, parse_ast, stringify_ast from black.ranges import adjusted_lines, convert_unchanged_lines, parse_line_ranges from black.report import Changed, NothingChanged, Report -from black.trans import iter_fexpr_spans from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node diff --git a/src/black/nodes.py b/src/black/nodes.py index 1a17dda2c6d..42051588d29 100644 --- a/src/black/nodes.py +++ b/src/black/nodes.py @@ -544,7 +544,6 @@ def is_arith_like(node: LN) -> bool: } - def is_docstring(node: NL, mode: Mode) -> bool: if isinstance(node, Leaf): if node.type != token.STRING: @@ -958,10 +957,6 @@ def is_rpar_token(nl: NL) -> TypeGuard[Leaf]: return nl.type == token.RPAR -def is_string_token(nl: NL) -> TypeGuard[Leaf]: - return nl.type == token.STRING - - def is_number_token(nl: NL) -> TypeGuard[Leaf]: return nl.type == token.NUMBER diff --git a/src/black/strings.py b/src/black/strings.py index 8cb10481bc9..517be9b9400 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -245,6 +245,7 @@ def normalize_string_quotes(s: str) -> str: return f"{prefix}{new_quote}{new_body}{new_quote}" + def normalize_fstring_quotes( quote: str, middles: List[Leaf], @@ -290,7 +291,6 @@ def normalize_fstring_quotes( new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment) new_segments.append(new_segment) - if new_quote == '"""' and new_segments[-1].endswith('"'): # edge case: new_segments[-1] = new_segments[-1][:-1] + '\\"' From 7ef92dbe8b3f5391b25ce8f4e7b71c9ea45829b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:41:24 +0000 Subject: [PATCH 52/77] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/black/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index a507528a4dc..f6602455f2c 100644 --- a/src/black/__init__.py +++ 
b/src/black/__init__.py @@ -69,12 +69,7 @@ from black.mode import FUTURE_FLAG_TO_FEATURE, VERSION_TO_FEATURES, Feature from black.mode import Mode as Mode # re-exported from black.mode import Preview, TargetVersion, supports_feature -from black.nodes import ( - STARS, - is_number_token, - is_simple_decorator_expression, - syms, -) +from black.nodes import STARS, is_number_token, is_simple_decorator_expression, syms from black.output import color_diff, diff, dump_to_file, err, ipynb_diff, out from black.parsing import InvalidInput # noqa F401 from black.parsing import lib2to3_parse, parse_ast, stringify_ast From 62e0b2b597747b882e1308e47dfff81230c57439 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 17 Mar 2024 23:43:56 +0530 Subject: [PATCH 53/77] fix debug visitor test --- src/blib2to3/pgen2/tokenize.py | 8 +- tests/data/miscellaneous/debug_visitor.out | 154 +++++++++++++++++++-- 2 files changed, 146 insertions(+), 16 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index d0843c557eb..64b547949c6 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -27,6 +27,7 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" +import builtins import sys from typing import ( Callable, @@ -465,12 +466,7 @@ def untokenize(iterable: Iterable[TokenInfo]) -> str: def is_fstring_start(token: str) -> bool: - # TODO: builtins.any is shadowed :( - for prefix in fstring_prefix: - if token.startswith(prefix): - return True - - return False + return builtins.any(token.startswith(prefix) for prefix in fstring_prefix) def generate_tokens( diff --git a/tests/data/miscellaneous/debug_visitor.out b/tests/data/miscellaneous/debug_visitor.out index fa60010d421..24d7ed82472 100644 --- a/tests/data/miscellaneous/debug_visitor.out +++ b/tests/data/miscellaneous/debug_visitor.out @@ -229,8 +229,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -370,8 +396,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}/{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '/' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -494,8 +546,34 @@ file_input LPAR '(' arglist - STRING - "f'{indent}{_type}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + 'indent' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + fstring_replacement_field + LBRACE + '{' + NAME + '_type' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -557,8 +635,36 @@ file_input LPAR '(' arglist - STRING - "f' {node.prefix!r}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + ' ' + fstring_replacement_field + LBRACE + '{' + power + NAME + 'node' + trailer + DOT + '.' + NAME + 'prefix' + /trailer + /power + BANG + '!' 
+ NAME + 'r' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument @@ -613,8 +719,36 @@ file_input LPAR '(' arglist - STRING - "f' {node.value!r}'" + fstring + FSTRING_START + "f'" + FSTRING_MIDDLE + ' ' + fstring_replacement_field + LBRACE + '{' + power + NAME + 'node' + trailer + DOT + '.' + NAME + 'value' + /trailer + /power + BANG + '!' + NAME + 'r' + RBRACE + '}' + /fstring_replacement_field + FSTRING_MIDDLE + '' + FSTRING_END + "'" + /fstring COMMA ',' argument From df38ea05adc7ff4d873fb7a0c1462b453c9aa1ae Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 19:40:53 +0530 Subject: [PATCH 54/77] fix most tests --- src/black/__init__.py | 12 +++++++----- src/black/linegen.py | 14 +++++++++++++- src/black/strings.py | 2 +- tests/data/cases/pep_701.py | 11 +++++++++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index f6602455f2c..ce2603a9b1d 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1230,9 +1230,9 @@ def _format_str_once( future_imports = get_future_imports(src_node) versions = detect_target_versions(src_node, future_imports=future_imports) - context_manager_features = { + line_generator_features = { feature - for feature in {Feature.PARENTHESIZED_CONTEXT_MANAGERS} + for feature in {Feature.PARENTHESIZED_CONTEXT_MANAGERS, Feature.FSTRING_PARSING} if supports_feature(versions, feature) } normalize_fmt_off(src_node, mode, lines) @@ -1240,7 +1240,7 @@ def _format_str_once( # This should be called after normalize_fmt_off. convert_unchanged_lines(src_node, lines) - line_generator = LineGenerator(mode=mode, features=context_manager_features) + line_generator = LineGenerator(mode=mode, features=line_generator_features) elt = EmptyLineTracker(mode=mode) split_line_features = { feature @@ -1322,8 +1322,10 @@ def get_features_used( # noqa: C901 for n in node.pre_order(): if n.type == token.FSTRING_START: features.add(Feature.F_STRINGS) - elif n.type == token.RBRACE and n.parent is not None and any( - child.type == token.EQUAL for child in n.parent.children + elif ( + n.type == token.RBRACE + and n.parent is not None + and any(child.type == token.EQUAL for child in n.parent.children) ): features.add(Feature.DEBUG_F_STRINGS) diff --git a/src/black/linegen.py b/src/black/linegen.py index d94e779e827..58daf3904ab 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -502,6 +502,12 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) def visit_fstring(self, node: Node) -> Iterator[Line]: + if Feature.FSTRING_PARSING not in self.features: + string_leaf = _fstring_to_string(node) + node.replace(string_leaf) + yield from self.visit_default(string_leaf) + return + fstring_start = node.children[0] fstring_end = node.children[-1] assert isinstance(fstring_start, Leaf) @@ -516,7 +522,7 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: assert quote == fstring_end.value - is_raw_fstring = 'r' in prefix or 'R' in prefix + is_raw_fstring = "r" in prefix or "R" in prefix middles = [leaf for leaf in node.leaves() if leaf.type == token.FSTRING_MIDDLE] if self.mode.string_normalization: @@ -560,6 +566,12 @@ def __post_init__(self) -> None: self.visit_guard = partial(v, keywords=Ø, parens={"if"}) +def _fstring_to_string(node: Node) -> Leaf: + """Converts an fstring node back to a string node.""" + string_without_prefix = str(node).removeprefix(node.prefix) + return Leaf(token.STRING, 
string_without_prefix, prefix=node.prefix) + + def _hugging_power_ops_line_to_string( line: Line, features: Collection[Feature], diff --git a/src/black/strings.py b/src/black/strings.py index 517be9b9400..a505ef67171 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -249,7 +249,7 @@ def normalize_string_quotes(s: str) -> str: def normalize_fstring_quotes( quote: str, middles: List[Leaf], - is_raw_fstring: bool + is_raw_fstring: bool, ) -> Tuple[List[Leaf], str]: """Prefer double quotes but only if it doesn't cause more escaping. diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 9f75836610f..f7efbcaa954 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -88,6 +88,11 @@ # %d # }""" +raise ValueError( + "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" + f" {lines_str!r}" + ) + # output x = f"foo" @@ -179,3 +184,9 @@ # %m # %d # }""" + +raise ValueError( + "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" f" { + lines_str!r + }" +) From 150a4fee086dd2087268a41c58d1c24295c071e0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 21:35:09 +0530 Subject: [PATCH 55/77] fix whitespace getting removed after fstring colon --- src/blib2to3/pgen2/tokenize.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 64b547949c6..aa2e3db3daf 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -141,13 +141,13 @@ def _combinations(*l: str) -> Set[str]: Single3Lbrace = r"[^'\\{]*(?:(?:\\.|{{|'(?!''))[^'\\{]*)*{(?!{)" Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{(?!{)' -# ! format specifier inside an fstring brace -Bang = Whitespace + group("!") +# ! format specifier inside an fstring brace, ensure it's not a `!=` token +Bang = Whitespace + group("!") + r'(?!=)' bang = re.compile(Bang) Colon = Whitespace + group(":") colon = re.compile(Colon) -FstringMiddleAfterColon = Whitespace + group(r".*?") + group("{", "}", "\n") +FstringMiddleAfterColon = group(Whitespace + r".*?") + group("{", "}", "\n") fstring_middle_after_colon = re.compile(FstringMiddleAfterColon) # Because of leftmost-then-longest match semantics, be sure to put the @@ -755,8 +755,6 @@ def generate_tokens( pos = end continue - # TODO: `=` is left, eg. 
f"{abc = }" - pseudomatch = pseudoprog.match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) From c4487cb50ff48758eb48810f7209e7d1c54c0be6 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 23:11:19 +0530 Subject: [PATCH 56/77] remove unnecessary continue --- src/blib2to3/pgen2/tokenize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index aa2e3db3daf..6e92b9ed22c 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -724,7 +724,6 @@ def generate_tokens( # TODO: in a triple quoted string we should infact add the \n here # formatspec += "\n" pos = brace_end - continue yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) formatspec = "" From ece7452535ea5247e48e8313419985796a09e198 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 23:14:01 +0530 Subject: [PATCH 57/77] don't use removeprefix --- src/black/linegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index ec00345e3b5..298306a38e4 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -569,7 +569,7 @@ def __post_init__(self) -> None: def _fstring_to_string(node: Node) -> Leaf: """Converts an fstring node back to a string node.""" - string_without_prefix = str(node).removeprefix(node.prefix) + string_without_prefix = str(node)[len(node.prefix):] return Leaf(token.STRING, string_without_prefix, prefix=node.prefix) From dfd345581c5c9630ac7768194ffbd82c8e987892 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 23:16:14 +0530 Subject: [PATCH 58/77] formatting --- src/black/linegen.py | 2 +- src/blib2to3/pgen2/tokenize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 298306a38e4..e388d32d440 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -569,7 +569,7 @@ def __post_init__(self) -> None: def _fstring_to_string(node: Node) -> Leaf: """Converts an fstring node back to a string node.""" - string_without_prefix = str(node)[len(node.prefix):] + string_without_prefix = str(node)[len(node.prefix) :] return Leaf(token.STRING, string_without_prefix, prefix=node.prefix) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 6e92b9ed22c..c20d7783937 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -142,7 +142,7 @@ def _combinations(*l: str) -> Set[str]: Double3Lbrace = r'[^"\\{]*(?:(?:\\.|{{|"(?!""))[^"\\{]*)*{(?!{)' # ! 
format specifier inside an fstring brace, ensure it's not a `!=` token -Bang = Whitespace + group("!") + r'(?!=)' +Bang = Whitespace + group("!") + r"(?!=)" bang = re.compile(Bang) Colon = Whitespace + group(":") colon = re.compile(Colon) From a81bae3bdbbcb2eac85d75375095b302218b14f1 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 25 Mar 2024 23:27:41 +0530 Subject: [PATCH 59/77] add minimum version --- tests/data/cases/pep_701.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index f7efbcaa954..3aa3f258912 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -1,4 +1,4 @@ -# flags: --target-version=py312 +# flags: --minimum-version=3.12 --target-version=py312 x = f"foo" x = f'foo' x = f"""foo""" From 0435144da8586ecb061190a9ab0a5bc00b13b256 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 28 Mar 2024 01:31:00 +0530 Subject: [PATCH 60/77] fix the one failing test --- src/blib2to3/Grammar.txt | 1 - src/blib2to3/pgen2/tokenize.py | 7 ++++--- tests/data/cases/pep_701.py | 6 ++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/blib2to3/Grammar.txt b/src/blib2to3/Grammar.txt index 9ceebbd3d81..0c8ac99daba 100644 --- a/src/blib2to3/Grammar.txt +++ b/src/blib2to3/Grammar.txt @@ -256,7 +256,6 @@ patterns: pattern (',' pattern)* [','] pattern: (expr|star_expr) ['as' expr] fstring: FSTRING_START fstring_middle* FSTRING_END -# TODO making these FSTRING_MIDDLE makes them unformattable so maybe put a new token here? fstring_middle: fstring_replacement_field | FSTRING_MIDDLE fstring_replacement_field: '{' (yield_expr | testlist_star_expr) ['='] [ "!" NAME ] [ ':' fstring_format_spec* ] '}' fstring_format_spec: FSTRING_MIDDLE | fstring_replacement_field diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index c20d7783937..caa76de3b16 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -665,7 +665,6 @@ def generate_tokens( if endmatch: # all on one line start, end = endmatch.span(0) token = line[start:end] - # TODO: check if the token will ever have any whitespace around? 
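# Aside (a hedged sketch, not a diff line): the nested-format-spec case this
# patch fixes can be checked directly against the patched tokenizer, assuming
# this checkout of blib2to3 is importable:
import io

from blib2to3.pgen2.token import tok_name
from blib2to3.pgen2.tokenize import generate_tokens

src = 'x = f"a{2+2:=^{foo(x):inner}one more}b"\n'
for tok_type, value, start, end, _line in generate_tokens(io.StringIO(src).readline):
    # "inner" and "one more" should both surface as FSTRING_MIDDLE: once the
    # inner replacement field closes, we are back inside the outer format spec.
    print(tok_name[tok_type], repr(value), start, end)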
if token.endswith(('"""', "'''")): middle_token, end_token = token[:-3], token[-3:] middle_epos = end_spos = (lnum, end - 3) @@ -707,8 +706,6 @@ def generate_tokens( contline = line break - # TODO: fstring_level > 0 is redundant in both cases here, - # remove it and ensure nothing breaks if inside_fstring_colon: match = fstring_middle_after_colon.match(line, pos) if match is None: @@ -982,6 +979,10 @@ def generate_tokens( else: if parenlev == 0 and bracelev > 0 and initial == "}": bracelev -= 1 + # if we're still inside fstrings, we're still part of the format spec + if inside_fstring_braces: + inside_fstring_colon = True + formatspec_start = (lnum, pos) elif initial in "([{": parenlev += 1 elif initial in ")]}": diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 3aa3f258912..4839e907526 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -67,8 +67,11 @@ rf'foo' rf'{foo}' +f"{x:{y}d}" + x = f"a{2+2:=^{x}}b" x = f"a{2+2:=^{foo(x+y**2):something else}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}one more}b" f'{(abc:=10)}' f"This is a really long string, but just make sure that you reflow fstrings { @@ -164,8 +167,11 @@ rf"foo" rf"{foo}" +f"{x:{y}d}" + x = f"a{2 + 2:=^{x}}b" x = f"a{2 + 2:=^{foo(x + y**2):something else}}b" +x = f"a{2 + 2:=^{foo(x + y**2):something else}one more}b" f"{(abc := 10)}" f"This is a really long string, but just make sure that you reflow fstrings {2 + 2:d}" From 99f8eb7468bd1b1fa0d87362bc01f9da559f947d Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 28 Mar 2024 19:38:36 +0530 Subject: [PATCH 61/77] fix couple more bugs --- src/blib2to3/pgen2/tokenize.py | 35 +++++++++++++++++++++++++++++----- tests/data/cases/pep_701.py | 8 ++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index caa76de3b16..83885fa9cad 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -469,6 +469,15 @@ def is_fstring_start(token: str) -> bool: return builtins.any(token.startswith(prefix) for prefix in fstring_prefix) +def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]: + for prefix in fstring_prefix: + _, prefix, rest = token.partition(prefix) + if prefix != "": + return prefix, rest + + raise ValueError(f"Token {token!r} is not a valid f-string start") + + def generate_tokens( readline: Callable[[], str], grammar: Optional[Grammar] = None ) -> Iterator[GoodTokenInfo]: @@ -531,11 +540,25 @@ def generate_tokens( spos = strstart epos = (lnum, end) tokenline = contline + line - if fstring_level == 0: + if fstring_level == 0 and not is_fstring_start(token): yield (STRING, token, spos, epos, tokenline) endprog_stack.pop() parenlev = parenlev_stack.pop() else: + if is_fstring_start(token): + fstring_level += 1 + fstring_start, token = _split_fstring_start_and_middle(token) + fstring_start_epos = (lnum, spos[1] + len(fstring_start)) + yield ( + FSTRING_START, + fstring_start, + spos, + fstring_start_epos, + tokenline, + ) + # increase spos to the end of the fstring start + spos = fstring_start_epos + if token.endswith("{"): fstring_middle, lbrace = token[:-1], token[-1] fstring_middle_epos = lbrace_spos = (lnum, end - 1) @@ -549,9 +572,12 @@ def generate_tokens( yield (LBRACE, lbrace, lbrace_spos, epos, line) inside_fstring_braces = True else: - # TODO: -3 maybe not guaranteed, could be \ separated single line string - fstring_middle, fstring_end = token[:-3], token[-3:] - fstring_middle_epos = end_spos = 
(lnum, end - 3) + if token.endswith(('"""', "'''")): + fstring_middle, fstring_end = token[:-3], token[-3:] + fstring_middle_epos = end_spos = (lnum, end - 3) + else: + fstring_middle, fstring_end = token[:-1], token[-1] + fstring_middle_epos = end_spos = (lnum, end - 1) yield ( FSTRING_MIDDLE, fstring_middle, @@ -792,7 +818,6 @@ def generate_tokens( if stashed: yield stashed stashed = None - # TODO: move this logic to a function if not is_fstring_start(token): pos = endmatch.end(0) token = line[start:pos] diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 4839e907526..115f08d4a48 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -96,6 +96,9 @@ f" {lines_str!r}" ) +f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ +got {escape}" + # output x = f"foo" @@ -196,3 +199,8 @@ lines_str!r }" ) + +f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ +got { + escape +}" From 3e56204db7196cc5f9f2115dd0a2f0e1e74be658 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 02:10:06 +0530 Subject: [PATCH 62/77] don't format fstrings at all --- src/black/__init__.py | 7 ++--- src/black/linegen.py | 44 +++++++++++++-------------- tests/data/cases/pep_701.py | 60 +++++++++++++++++-------------------- 3 files changed, 53 insertions(+), 58 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index fb614ba87a1..6ba49d5ef2d 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -1244,9 +1244,9 @@ def _format_str_once( future_imports = get_future_imports(src_node) versions = detect_target_versions(src_node, future_imports=future_imports) - line_generator_features = { + context_manager_features = { feature - for feature in {Feature.PARENTHESIZED_CONTEXT_MANAGERS, Feature.FSTRING_PARSING} + for feature in {Feature.PARENTHESIZED_CONTEXT_MANAGERS} if supports_feature(versions, feature) } normalize_fmt_off(src_node, mode, lines) @@ -1254,14 +1254,13 @@ def _format_str_once( # This should be called after normalize_fmt_off. convert_unchanged_lines(src_node, lines) - line_generator = LineGenerator(mode=mode, features=line_generator_features) + line_generator = LineGenerator(mode=mode, features=context_manager_features) elt = EmptyLineTracker(mode=mode) split_line_features = { feature for feature in { Feature.TRAILING_COMMA_IN_CALL, Feature.TRAILING_COMMA_IN_DEF, - Feature.FSTRING_PARSING, } if supports_feature(versions, feature) } diff --git a/src/black/linegen.py b/src/black/linegen.py index e388d32d440..1caa59207ca 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -503,36 +503,36 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: yield from self.visit_default(leaf) def visit_fstring(self, node: Node) -> Iterator[Line]: - if Feature.FSTRING_PARSING not in self.features: - string_leaf = _fstring_to_string(node) - node.replace(string_leaf) - yield from self.visit_default(string_leaf) - return + # currently we don't want to format and split f-strings at all. 
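# Hedged illustration (assuming `node` is a syms.fstring Node parsed from
# the source f"{x}"):
#
#     string_leaf = _fstring_to_string(node)
#     # -> Leaf(token.STRING, 'f"{x}"'), carrying the node's prefix
#
# so every later stage sees one opaque STRING leaf, exactly as it did
# before f-strings were parsed into their own subtree.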
+ string_leaf = _fstring_to_string(node) + node.replace(string_leaf) + yield from self.visit_default(string_leaf) - fstring_start = node.children[0] - fstring_end = node.children[-1] - assert isinstance(fstring_start, Leaf) - assert isinstance(fstring_end, Leaf) + # TODO: Uncomment Implementation to format f-string children + # fstring_start = node.children[0] + # fstring_end = node.children[-1] + # assert isinstance(fstring_start, Leaf) + # assert isinstance(fstring_end, Leaf) - quote_char = fstring_end.value[0] - quote_idx = fstring_start.value.index(quote_char) - prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] + # quote_char = fstring_end.value[0] + # quote_idx = fstring_start.value.index(quote_char) + # prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] - if not is_docstring(node, self.mode): - prefix = normalize_string_prefix(prefix) + # if not is_docstring(node, self.mode): + # prefix = normalize_string_prefix(prefix) - assert quote == fstring_end.value + # assert quote == fstring_end.value - is_raw_fstring = "r" in prefix or "R" in prefix - middles = [leaf for leaf in node.leaves() if leaf.type == token.FSTRING_MIDDLE] + # is_raw_fstring = "r" in prefix or "R" in prefix + # middles = [leaf for leaf in node.leaves() if leaf.type == token.FSTRING_MIDDLE] - if self.mode.string_normalization: - middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) + # if self.mode.string_normalization: + # middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) - fstring_start.value = prefix + quote - fstring_end.value = quote + # fstring_start.value = prefix + quote + # fstring_end.value = quote - yield from self.visit_default(node) + # yield from self.visit_default(node) def __post_init__(self) -> None: """You are in a twisty little maze of passages.""" diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 115f08d4a48..927e0c4feca 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -114,13 +114,15 @@ # edge case: FSTRING_MIDDLE containing only whitespace should not be stripped x = f"{a} {b}" -x = f"foo {2 + 2} bar baz" +x = f"foo { + 2 + 2 +} bar baz" x = f"foo {{ {"a {2 + 2} b"}bar {{ baz" -x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" +x = f"foo {{ {f'a {2 + 2} b'}bar {{ baz" x = f"foo {{ {f"a {2 + 2} b"}bar {{ baz" -x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" +x = f"foo {{ {f'a {f"a {2 + 2} b"} b'}bar {{ baz" x = f"foo {{ {f"a {f"a {2 + 2} b"} b"}bar {{ baz" x = """foo {{ {2 + 2}bar @@ -129,7 +131,9 @@ x = f"""foo {{ {2 + 2}bar {{ baz""" -x = f"""foo {{ {2 + 2}bar {{ baz""" +x = f"""foo {{ { + 2 + 2 +}bar {{ baz""" x = f"""foo {{ { @@ -138,49 +142,44 @@ baz""" x = f"""foo {{ a - foo { - 2 + 2 -}bar {{ baz + foo {2 + 2}bar {{ baz x = f"foo {{ { - 2 + 2 # comment -}bar" + 2 + 2 # comment + }bar" {{ baz }} buzz - { - print("abc" + "def") -} + {print("abc" + "def" +)} abc""" # edge case: end triple quotes at index zero -f"""foo { - 2 + 2 -} bar +f"""foo {2+2} bar """ -f" ' {f"'"} ' " -f' " {f'"'} " ' +f' \' {f"'"} \' ' +f" \" {f'"'} \" " -x = f"a{2 + 2:=^72}b" -x = f"a{2 + 2:x}b" +x = f"a{2+2:=^72}b" +x = f"a{2+2:x}b" rf"foo" rf"{foo}" f"{x:{y}d}" -x = f"a{2 + 2:=^{x}}b" -x = f"a{2 + 2:=^{foo(x + y**2):something else}}b" -x = f"a{2 + 2:=^{foo(x + y**2):something else}one more}b" -f"{(abc := 10)}" +x = f"a{2+2:=^{x}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}}b" +x = f"a{2+2:=^{foo(x+y**2):something else}one more}b" +f"{(abc:=10)}" -f"This is a really 
long string, but just make sure that you reflow fstrings {2 + 2:d}" -f"This is a really long string, but just make sure that you reflow fstrings correctly { - 2 + 2:d +f"This is a really long string, but just make sure that you reflow fstrings { + 2+2:d }" +f"This is a really long string, but just make sure that you reflow fstrings correctly {2+2:d}" # TODO: Edge case: if the fstring replacement ends with a `=` it should not be touched # f"{2+2=}" @@ -195,12 +194,9 @@ # }""" raise ValueError( - "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" f" { - lines_str!r - }" + "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" + f" {lines_str!r}" ) f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ -got { - escape -}" +got {escape}" From 9495f5e2eeb7582dc03d8a78d04e1be4b73c78e8 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 02:13:10 +0530 Subject: [PATCH 63/77] address comments --- src/black/linegen.py | 1 - src/blib2to3/pgen2/driver.py | 2 +- tests/util.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 1caa59207ca..55102c02019 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -65,7 +65,6 @@ from black.strings import ( fix_docstring, get_string_prefix, - normalize_fstring_quotes, normalize_string_prefix, normalize_string_quotes, normalize_unicode_escape_sequences, diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index 01b9dd00aa3..71a147cbcd8 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -167,7 +167,7 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> if type in {token.INDENT, token.DEDENT}: prefix = _prefix lineno, column = end - # FSTRING_MIDDLE is the only character that can end with a newline, and + # FSTRING_MIDDLE is the only token that can end with a newline, and # `end` will point to the next line. For that case, don't increment lineno. 
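# A rough example of the shape being special-cased here (sketch): in
#
#     f"""abc
#     {x}"""
#
# the first FSTRING_MIDDLE is "abc\n", and its `end` coordinate already
# points at the following line, so bumping `lineno` for the trailing
# newline as well would leave the position tracking off by one line.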
if value.endswith("\n") and type != token.FSTRING_MIDDLE: lineno += 1 diff --git a/tests/util.py b/tests/util.py index 3a6a0f9bd98..d5425f1f743 100644 --- a/tests/util.py +++ b/tests/util.py @@ -237,7 +237,7 @@ def get_flags_parser() -> argparse.ArgumentParser: "--target-version", action="append", type=lambda val: TargetVersion[val.upper()], - default=[], + default=(), ) parser.add_argument("--line-length", default=DEFAULT_LINE_LENGTH, type=int) parser.add_argument( From cf7648296c941be770725e0313f6a732b3556616 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 02:17:00 +0530 Subject: [PATCH 64/77] flake8 --- src/black/linegen.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 55102c02019..2f2ae431818 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -515,7 +515,10 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: # quote_char = fstring_end.value[0] # quote_idx = fstring_start.value.index(quote_char) - # prefix, quote = fstring_start.value[:quote_idx], fstring_start.value[quote_idx:] + # prefix, quote = ( + # fstring_start.value[:quote_idx], + # fstring_start.value[quote_idx:] + # ) # if not is_docstring(node, self.mode): # prefix = normalize_string_prefix(prefix) @@ -523,7 +526,11 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: # assert quote == fstring_end.value # is_raw_fstring = "r" in prefix or "R" in prefix - # middles = [leaf for leaf in node.leaves() if leaf.type == token.FSTRING_MIDDLE] + # middles = [ + # leaf + # for leaf in node.leaves() + # if leaf.type == token.FSTRING_MIDDLE + # ] # if self.mode.string_normalization: # middles, quote = normalize_fstring_quotes(quote, middles, is_raw_fstring) From bbff3dee251bfb7f3b71a14d61dd27509d1e78ab Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 02:39:47 +0530 Subject: [PATCH 65/77] fix failing test --- tests/data/cases/pep_701.py | 2 +- tests/util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 927e0c4feca..b7660d13d8c 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -1,4 +1,4 @@ -# flags: --minimum-version=3.12 --target-version=py312 +# flags: --minimum-version=3.12 x = f"foo" x = f'foo' x = f"""foo""" diff --git a/tests/util.py b/tests/util.py index d5425f1f743..3a6a0f9bd98 100644 --- a/tests/util.py +++ b/tests/util.py @@ -237,7 +237,7 @@ def get_flags_parser() -> argparse.ArgumentParser: "--target-version", action="append", type=lambda val: TargetVersion[val.upper()], - default=(), + default=[], ) parser.add_argument("--line-length", default=DEFAULT_LINE_LENGTH, type=int) parser.add_argument( From 0fef83cff324c96019712fa681e4e2c2e09971bd Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 03:02:22 +0530 Subject: [PATCH 66/77] undo default change --- tests/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/util.py b/tests/util.py index 3a6a0f9bd98..d5425f1f743 100644 --- a/tests/util.py +++ b/tests/util.py @@ -237,7 +237,7 @@ def get_flags_parser() -> argparse.ArgumentParser: "--target-version", action="append", type=lambda val: TargetVersion[val.upper()], - default=[], + default=(), ) parser.add_argument("--line-length", default=DEFAULT_LINE_LENGTH, type=int) parser.add_argument( From c5703609e784b675949ecaf34c35066cf9b80d4b Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 3 Apr 2024 03:03:32 +0530 Subject: [PATCH 
67/77] remove todo --- src/black/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/strings.py b/src/black/strings.py index 031f3c31f2c..69a8c8002e9 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -210,7 +210,7 @@ def normalize_string_quotes(s: str) -> str: s = f"{prefix}{orig_quote}{body}{orig_quote}" new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body) new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body) - # TODO: can probably be removed + if "f" in prefix.casefold(): matches = re.findall( r""" From 2a697c8d815eb1377246d160631b4cbf22f3fb3d Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Fri, 5 Apr 2024 04:47:03 +0530 Subject: [PATCH 68/77] fix: \N{} case --- src/blib2to3/pgen2/tokenize.py | 16 +++++++++------- tests/data/cases/pep_701.py | 3 +++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 83885fa9cad..5e1676e2bc6 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -135,11 +135,13 @@ def _combinations(*l: str) -> Set[str]: _fstringlitprefix + "'''", ) -SingleLbrace = r"[^'\\{]*(?:(?:\\.|{{)[^'\\{]*)*{(?!{)" -DoubleLbrace = r'[^"\\{]*(?:(?:\\.|{{)[^"\\{]*)*{(?!{)' +# beginning of a single quoted f-string. must not end with `{{` or `\N{` +SingleLbrace = r"[^'\\{]*(?:(?:\\N{|\\.|{{)[^'\\{]*)*(? Set[str]: _string_middle_single = r"[^\n'\\]*(?:\\.[^\n'\\]*)*" _string_middle_double = r'[^\n"\\]*(?:\\.[^\n"\\]*)*' -# FSTRING_MIDDLE and LBRACE, inside a single quoted fstring -_fstring_middle_single = r"[^\n'\\{]*(?:(?:\\.|{{)[^\n'\\{]*)*({)(?!{)" -_fstring_middle_double = r'[^\n"\\{]*(?:(?:\\.|{{)[^\n"\\{]*)*({)(?!{)' +# FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{` +_fstring_middle_single = r"[^\n'\\{]*(?:(?:\\N{|\\.|{{)[^\n'\\{]*)*(? Date: Mon, 8 Apr 2024 23:44:38 +0530 Subject: [PATCH 69/77] make test a little better --- tests/data/cases/pep_701.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 47af44b929e..0b6995bfd7e 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -99,7 +99,7 @@ f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ got {escape}" -x = f'\N{GREEK CAPITAL LETTER DELTA}' +x = f'\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}' # output x = f"foo" @@ -202,4 +202,4 @@ f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ got {escape}" -x = f"\N{GREEK CAPITAL LETTER DELTA}" \ No newline at end of file +x = f"\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}" From 1ab815b6dbe577242fcec4001ccb521182d76a4d Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 14 Apr 2024 16:16:08 +0530 Subject: [PATCH 70/77] tweak regex to fix edge cases --- src/blib2to3/pgen2/tokenize.py | 8 ++++---- tests/data/cases/pep_701.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 5e1676e2bc6..565f2cf2835 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -140,8 +140,8 @@ def _combinations(*l: str) -> Set[str]: DoubleLbrace = r'[^"\\{]*(?:(?:\\N{|\\.|{{)[^"\\{]*)*(? Set[str]: _string_middle_double = r'[^\n"\\]*(?:\\.[^\n"\\]*)*' # FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{` -_fstring_middle_single = r"[^\n'\\{]*(?:(?:\\N{|\\.|{{)[^\n'\\{]*)*(? 
Date: Mon, 22 Apr 2024 11:15:05 +0530 Subject: [PATCH 71/77] fix edge case with nested multiline strings --- src/blib2to3/pgen2/tokenize.py | 2 +- tests/data/cases/pep_701.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 565f2cf2835..d8690f8bcdd 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -530,7 +530,7 @@ def generate_tokens( lnum += 1 pos, max = 0, len(line) - if contstr and not inside_fstring_braces: # continued string + if contstr: # continued string assert contline is not None if not line: raise TokenError("EOF in multi-line string", strstart) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index a2ce38bd25b..14e0e0abcdc 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -101,6 +101,11 @@ x = f'\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}' fr'\{{\}}' +f""" + WITH {f''' + {1}_cte AS ()'''} +""" + # output x = f"foo" @@ -204,3 +209,8 @@ x = f"\N{GREEK CAPITAL LETTER DELTA} \N{SNOWMAN} {x}" rf"\{{\}}" + +f""" + WITH {f''' + {1}_cte AS ()'''} +""" \ No newline at end of file From a64939d4adf50d1edbd1f579fdf0952b2b9545cf Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 22 Apr 2024 11:15:55 +0530 Subject: [PATCH 72/77] whitespace --- tests/data/cases/pep_701.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index 14e0e0abcdc..ccd3ec40447 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -213,4 +213,4 @@ f""" WITH {f''' {1}_cte AS ()'''} -""" \ No newline at end of file +""" From 7df45fb2b31adc027f02c3a164ba6554066325ee Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 22 Apr 2024 16:24:25 +0530 Subject: [PATCH 73/77] fix multiline formatspec todo --- src/blib2to3/pgen2/tokenize.py | 8 ++++---- tests/data/cases/pep_701.py | 22 ++++++++++------------ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index d8690f8bcdd..d6b684ab1aa 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -149,7 +149,7 @@ def _combinations(*l: str) -> Set[str]: Colon = Whitespace + group(":") colon = re.compile(Colon) -FstringMiddleAfterColon = group(Whitespace + r".*?") + group("{", "}", "\n") +FstringMiddleAfterColon = group(Whitespace + r".*?") + group("{", "}") fstring_middle_after_colon = re.compile(FstringMiddleAfterColon) # Because of leftmost-then-longest match semantics, be sure to put the @@ -737,7 +737,9 @@ def generate_tokens( if inside_fstring_colon: match = fstring_middle_after_colon.match(line, pos) if match is None: - raise TokenError("unterminated f-string literal", (lnum, pos)) + formatspec += line[pos:] + pos = max + continue start, end = match.span(1) token = line[start:end] @@ -746,8 +748,6 @@ def generate_tokens( brace_start, brace_end = match.span(2) brace_or_nl = line[brace_start:brace_end] if brace_or_nl == "\n": - # TODO: in a triple quoted string we should infact add the \n here - # formatspec += "\n" pos = brace_end yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index ccd3ec40447..bfd2a7ad7b7 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -83,12 +83,11 @@ f"{2+2 = }" f"{ 2 + 2 = }" -# TODO: -# f"""foo { -# datetime.datetime.now():%Y -# %m -# %d -# 
}""" +f"""foo { + datetime.datetime.now():%Y +%m +%d +}""" raise ValueError( "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" @@ -192,12 +191,11 @@ f"{2+2 = }" f"{ 2 + 2 = }" -# TODO: -# f"""foo { -# datetime.datetime.now():%Y -# %m -# %d -# }""" +f"""foo { + datetime.datetime.now():%Y +%m +%d +}""" raise ValueError( "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" From 36e04d2e5e1cac0e516aed5955d971cb1a03a842 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 22 Apr 2024 16:25:44 +0530 Subject: [PATCH 74/77] add another test case --- tests/data/cases/pep_701.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py index bfd2a7ad7b7..c5bc48e95f2 100644 --- a/tests/data/cases/pep_701.py +++ b/tests/data/cases/pep_701.py @@ -89,6 +89,11 @@ %d }""" +f"{ +X +!r +}" + raise ValueError( "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" f" {lines_str!r}" @@ -197,6 +202,11 @@ %d }""" +f"{ +X +!r +}" + raise ValueError( "xxxxxxxxxxxIncorrect --line-ranges format, expect START-END, found" f" {lines_str!r}" From eb05cd4cb250fc22657ee47d0ed1c3ae2fa714d2 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 22 Apr 2024 16:51:45 +0530 Subject: [PATCH 75/77] Revert "Remove node-specific logic from visit_default (#4321)" This reverts commit 7134754ef45078b032039ad858bdaaef146233b2. --- src/black/linegen.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 64db7b6208c..2f2ae431818 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -152,6 +152,11 @@ def visit_default(self, node: LN) -> Iterator[Line]: if any_open_brackets: node.prefix = "" + if self.mode.string_normalization and node.type == token.STRING: + node.value = normalize_string_prefix(node.value) + node.value = normalize_string_quotes(node.value) + if node.type == token.NUMBER: + normalize_numeric_literal(node) if node.type not in WHITESPACE: self.current_line.append(node) yield from super().visit_default(node) @@ -415,11 +420,12 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: # indentation of those changes the AST representation of the code. if self.mode.string_normalization: docstring = normalize_string_prefix(leaf.value) - # We handle string normalization at the end of this method, but since - # what we do right now acts differently depending on quote style (ex. + # visit_default() does handle string normalization for us, but + # since this method acts differently depending on quote style (ex. # see padding logic below), there's a possibility for unstable - # formatting. To avoid a situation where this function formats a - # docstring differently on the second pass, normalize it early. + # formatting as visit_default() is called *after*. To avoid a + # situation where this function formats a docstring differently on + # the second pass, normalize it early. 
docstring = normalize_string_quotes(docstring) else: docstring = leaf.value @@ -493,13 +499,6 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: else: leaf.value = prefix + quote + docstring + quote - if self.mode.string_normalization and leaf.type == token.STRING: - leaf.value = normalize_string_prefix(leaf.value) - leaf.value = normalize_string_quotes(leaf.value) - yield from self.visit_default(leaf) - - def visit_NUMBER(self, leaf: Leaf) -> Iterator[Line]: - normalize_numeric_literal(leaf) yield from self.visit_default(leaf) def visit_fstring(self, node: Node) -> Iterator[Line]: From 5d727ec86639c553187c9a0b976aafddbd00abc1 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Mon, 22 Apr 2024 07:59:07 -0700 Subject: [PATCH 76/77] Revert "Revert "Remove node-specific logic from visit_default (#4321)"" This reverts commit eb05cd4cb250fc22657ee47d0ed1c3ae2fa714d2. --- src/black/linegen.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 2f2ae431818..64db7b6208c 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -152,11 +152,6 @@ def visit_default(self, node: LN) -> Iterator[Line]: if any_open_brackets: node.prefix = "" - if self.mode.string_normalization and node.type == token.STRING: - node.value = normalize_string_prefix(node.value) - node.value = normalize_string_quotes(node.value) - if node.type == token.NUMBER: - normalize_numeric_literal(node) if node.type not in WHITESPACE: self.current_line.append(node) yield from super().visit_default(node) @@ -420,12 +415,11 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: # indentation of those changes the AST representation of the code. if self.mode.string_normalization: docstring = normalize_string_prefix(leaf.value) - # visit_default() does handle string normalization for us, but - # since this method acts differently depending on quote style (ex. + # We handle string normalization at the end of this method, but since + # what we do right now acts differently depending on quote style (ex. # see padding logic below), there's a possibility for unstable - # formatting as visit_default() is called *after*. To avoid a - # situation where this function formats a docstring differently on - # the second pass, normalize it early. + # formatting. To avoid a situation where this function formats a + # docstring differently on the second pass, normalize it early. 
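# Concrete case (a sketch of the instability being avoided): quote
# normalization can rewrite '''...''' docstrings to """...""", and the
# padding decisions below key off the final quote character, so running
# normalization only afterwards could yield a different result on a
# second formatting pass.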
docstring = normalize_string_quotes(docstring) else: docstring = leaf.value @@ -499,6 +493,13 @@ def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: else: leaf.value = prefix + quote + docstring + quote + if self.mode.string_normalization and leaf.type == token.STRING: + leaf.value = normalize_string_prefix(leaf.value) + leaf.value = normalize_string_quotes(leaf.value) + yield from self.visit_default(leaf) + + def visit_NUMBER(self, leaf: Leaf) -> Iterator[Line]: + normalize_numeric_literal(leaf) yield from self.visit_default(leaf) def visit_fstring(self, node: Node) -> Iterator[Line]: From ab2f43c51f9385188ae952e5354c1d8955a9b8a0 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Mon, 22 Apr 2024 08:00:41 -0700 Subject: [PATCH 77/77] fix --- src/black/linegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 64db7b6208c..4b29a049dba 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -506,7 +506,7 @@ def visit_fstring(self, node: Node) -> Iterator[Line]: # currently we don't want to format and split f-strings at all. string_leaf = _fstring_to_string(node) node.replace(string_leaf) - yield from self.visit_default(string_leaf) + yield from self.visit_STRING(string_leaf) # TODO: Uncomment Implementation to format f-string children # fstring_start = node.children[0]
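A quick end-to-end check of the whole series (a hedged sketch rather than
part of any patch above; it assumes this branch of black is installed and
that format_str/Mode/TargetVersion keep their usual public signatures):

    import black

    # Quote reuse inside a replacement field is only legal under PEP 701.
    src = 'x = f"outer {"inner"} done"\n'
    mode = black.Mode(target_versions={black.TargetVersion.PY312})
    print(black.format_str(src, mode=mode), end="")

With these patches applied, the 3.12-only nesting should parse cleanly, and
the f-string itself should come back unchanged, since visit_fstring()
collapses the whole subtree into a single STRING leaf before any line
splitting happens.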