Skip to content

Commit

Permalink
Reimplement "Dependencies.strip_string_literals()" to handle unclosed string literals and recursive f-strings.
Browse files Browse the repository at this point in the history

Closes cython#5977
  • Loading branch information
scoder committed Feb 11, 2024
1 parent 4c87330 commit c15cef2
Show file tree
Hide file tree
Showing 2 changed files with 227 additions and 78 deletions.
192 changes: 115 additions & 77 deletions Cython/Build/Dependencies.py
Expand Up @@ -306,94 +306,132 @@ def apply(self, extension):
setattr(extension, key, value)


# Tokeniser regexes for strip_string_literals() below.  Each returns the
# bound ".search" callable; cython.declare() keeps them as plain objects
# in compiled mode.

# Top-level code: the next interesting token is a comment start, a run of
# '}' braces (only relevant while parsing inside an f-string format
# expression), or an opening quote run, optionally prefixed by 'f'.
_FIND_TOKEN = cython.declare(object, re.compile(r"""
    (?P<comment> [#] ) |
    (?P<braces> [}]+ ) |
    (?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)

# Inside a plain string literal: backslash escape sequences ending in a
# quote character, or quote runs (potential terminators).
_FIND_STRING_TOKEN = cython.declare(object, re.compile(r"""
    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
    (?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)

# Inside an f-string: additionally match brace runs, which open/close
# format expressions ('{{' / '}}' are literal braces).
_FIND_FSTRING_TOKEN = cython.declare(object, re.compile(r"""
    (?P<braces> [{]+ | [}]+ ) |
    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
    (?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)


def strip_string_literals(code, prefix='__Pyx_L'):
    """
    Normalizes every string literal to be of the form '__Pyx_Lxxx',
    returning the normalized code and a mapping of labels to
    string literals.

    Handles comments, triple quotes, escaped quotes, f-strings with
    nested format expressions (including nested f-strings), and
    unclosed string literals in broken source files (GH-5977).
    """
    new_code: list = []
    literals: dict = {}
    counter: cython.Py_ssize_t = 0
    find_token = _FIND_TOKEN

    def new_label(literal) -> str:
        # Store 'literal' under a fresh "__Pyx_LNNN_" label and return the label.
        nonlocal counter
        counter += 1
        label = f"{prefix}{counter}_"
        literals[label] = literal
        return label

    def parse_string(quote_type: str, start: cython.Py_ssize_t, is_fstring: cython.bint) -> cython.Py_ssize_t:
        # Parse a string literal that starts (after its opening quote) at
        # 'start', terminated by 'quote_type' (one or three quote chars).
        # Returns the position after the closing quote, or -1 at EOF.
        charpos: cython.Py_ssize_t = start
        end: cython.Py_ssize_t

        find_token = _FIND_FSTRING_TOKEN if is_fstring else _FIND_STRING_TOKEN

        while charpos != -1:
            token = find_token(code, charpos)
            if token is None:
                # This probably indicates an unclosed string literal, i.e. a broken file.
                new_code.append(new_label(code[start:]))
                charpos = -1
                break
            end = token.end()

            if token['escape']:
                charpos = end
                if len(token['escape']) % 2 == 0 and token['escaped_quote'] == quote_type[0]:
                    # quote is not actually escaped and might be part of a terminator, look at it next
                    charpos -= 1
            elif is_fstring and token['braces']:
                # formats or brace(s) in fstring
                charpos = end
                if len(token['braces']) % 2 == 0:
                    # normal brace characters in string
                    continue
                if token['braces'][-1] == '{':
                    # odd brace run ending in '{' opens a format expression;
                    # strip the literal text before it and recurse into code.
                    if start < end-1:
                        new_code.append(new_label(code[start : end-1]))
                    new_code.append('{')
                    start = charpos = parse_code(end, in_fstring=True)
            elif token['quote'].startswith(quote_type):
                # closing quote found (potentially together with further, unrelated quotes)
                charpos = token.start('quote')
                if charpos > start:
                    new_code.append(new_label(code[start : charpos]))
                new_code.append(quote_type)
                charpos += len(quote_type)
                break
            else:
                # string internal quote(s)
                charpos = end

        return charpos

    def parse_code(start: cython.Py_ssize_t, in_fstring: cython.bint = False) -> cython.Py_ssize_t:
        # Parse normal code from 'start', dispatching into parse_string()
        # for literals and stripping comments.  When called recursively for
        # an f-string format expression ('in_fstring'), returns right after
        # the closing '}'.  Returns -1 at EOF.
        charpos: cython.Py_ssize_t = start
        end: cython.Py_ssize_t
        quote: str

        while charpos != -1:
            token = find_token(code, charpos)
            if token is None:
                # No more strings/comments: keep the rest of the code as is.
                new_code.append(code[start:])
                charpos = -1
                break
            charpos = end = token.end()

            if token['quote']:
                quote = token['quote']
                if len(quote) >= 6:
                    # ignore empty triple-quoted strings: '''''' or """"""
                    quote = quote[:len(quote) % 6]
                # len(quote) == 2 is an empty string literal ('' or "") - keep as is.
                if quote and len(quote) != 2:
                    if len(quote) > 3:
                        # e.g. '''' is an opening triple quote followed by string content.
                        end -= len(quote) - 3
                        quote = quote[:3]
                    new_code.append(code[start:end])
                    start = charpos = parse_string(quote, end, is_fstring=token['fstring'])

            elif token['comment']:
                # Keep '#' and the following character, strip the comment text
                # up to (excluding) the line end into a literal.
                new_code.append(code[start:end+1])
                charpos = code.find('\n', end+1)
                new_code.append(new_label(code[end+1 : charpos if charpos != -1 else None]))
                if charpos == -1:
                    break  # EOF
                start = charpos

            elif token['braces']:
                if in_fstring and len(token['braces']) % 2 == 1:
                    # closing '}' of f-string
                    charpos = end = token.start() + 1
                    new_code.append(code[start:end])  # with '}'
                    break
                # otherwise: plain '}' characters in normal code - ignore.

        return charpos

    parse_code(0)
    return "".join(new_code), literals


Expand Down
113 changes: 112 additions & 1 deletion Cython/Build/Tests/TestDependencies.py
@@ -1,10 +1,13 @@
import contextlib
import os.path
import pathlib
import re
import tempfile
import unittest
from os.path import join as pjoin

from ..Dependencies import extended_iglob
from ...Utils import open_source_file
from ..Dependencies import extended_iglob, strip_string_literals


@contextlib.contextmanager
Expand Down Expand Up @@ -131,3 +134,111 @@ def test_extended_iglob_double_star(self):
self.files_equal("**/*.{py}", all_files[1::2])
self.files_equal("*/*/*.py", files[1::2])
self.files_equal("**/*.py", all_files[1::2])


class TestCodeProcessing(unittest.TestCase):
    """Tests for Dependencies.strip_string_literals()."""
    maxDiff = None

    @staticmethod
    def _rebuild_string(stripped, literals):
        """Reverse the stripping: substitute each label back with its literal."""
        def lookup(match):
            return literals[match.group()]

        return re.sub("__Pyx_L[0-9]+_", lookup, stripped)

    def test_strip_string_literals(self):
        def strip_equals(s, expected):
            # Check both the stripped form and that it round-trips losslessly.
            stripped, literals = strip_string_literals(s)
            self.assertEqual(expected, stripped)

            recovered = self._rebuild_string(stripped, literals)
            self.assertEqual(s, recovered)

        # Inputs that must pass through unchanged (no or empty literals).
        unchanged = [
            """abc""",
            """123""",
            """func(123)""",
            """ '' """,
            """ '''''''''''' """,
            """ '''''''''''''' """,
        ]

        # (input, expected stripped output) pairs, covering triple quotes,
        # comments, f-strings with nested format expressions and braces.
        tests = [(code, code) for code in unchanged] + [
            (""" '''' ''' """,
             """ '''__Pyx_L1_''' """),

            (''' """" """ ''',
             ''' """__Pyx_L1_""" '''),

            (""" func('xyz') + " " + "" '' # '' | "" "123" 'xyz' "' """,
             """ func('__Pyx_L1_') + "__Pyx_L2_" + "" '' # __Pyx_L3_"""),

            (""" f'f' """,
             """ f'__Pyx_L1_' """),

            (""" f'a{123}b' """,
             """ f'__Pyx_L1_{123}__Pyx_L2_' """),

            (""" f'{1}{f'xyz'}' """,
             """ f'{1}{f'__Pyx_L1_'}' """),

            (""" f'{f'''xyz{f\"""abc\"""}'''}' """,
             """ f'{f'''__Pyx_L1_{f\"""__Pyx_L2_\"""}'''}' """),

            (""" f'{{{{{"abc"}}}}}{{}}{{' == '{{abc}}{}{' """,
             """ f'__Pyx_L1_{"__Pyx_L2_"}__Pyx_L3_' == '__Pyx_L4_' """),
        ]

        for code, expected in tests:
            # Each case is exercised as-is, stripped of surrounding
            # whitespace, and with a trailing newline appended.
            with self.subTest(code=code):
                strip_equals(code, expected)
            code = code.strip()
            expected = expected.strip()
            with self.subTest(code=code):
                strip_equals(code, expected)
            code += "\n"
            expected += "\n"
            with self.subTest(code=code):
                strip_equals(code, expected)

        # GH5977: unclosed string literal
        strip_equals(
            """ print("Say something: %s' % something) """,
            """ print("__Pyx_L1_"""
        )


    def _test_all_files(self, base_dir, file_paths):
        """Strip all given files and verify no quotes leak and stripping round-trips."""
        # A quote run in stripped output is only acceptable as an empty
        # string ('' / "") or adjacent to a label/brace - anything else leaked.
        _find_leftover_string = re.compile(r"""[^_'"}](['"]+)[^_'"{]""").search
        for file_path in sorted(file_paths):
            with self.subTest(file=str(file_path.relative_to(base_dir))):
                with open_source_file(str(file_path)) as f:
                    code = f.read()
                stripped, literals = strip_string_literals(code)

                match = _find_leftover_string(stripped)
                if match and len(match.group(1)) != 2:
                    match_pos = match.start() + 1
                    self.fail(f"Leftover string found: {stripped[match_pos - 12 : match_pos + 12]!r}")

                recovered = self._rebuild_string(stripped, literals)
                self.assertEqual(code, recovered)


    def test_strip_string_literals_py_files(self):
        # process all .py files in the Cython package
        package_dir = pathlib.Path(__file__).absolute().parents[2]
        assert package_dir.name == 'Cython'
        base_dir = package_dir.parent
        self._test_all_files(base_dir, package_dir.rglob("*.py"))

    def test_strip_string_literals_test_files(self):
        # process all .py[x] files in the tests package
        base_dir = pathlib.Path(__file__).absolute().parents[3]
        tests_dir = base_dir / 'tests'
        test_files = []
        for test_subdir in tests_dir.iterdir():
            # 'errors' tests are deliberately broken sources - skip them.
            if test_subdir.is_dir() and test_subdir.name != 'errors':
                test_files.extend(test_subdir.rglob("*.py"))
                test_files.extend(test_subdir.rglob("*.pyx"))
        self._test_all_files(base_dir, test_files)

0 comments on commit c15cef2

Please sign in to comment.