From c15cef2915335581c368006b1815baa97dbbab17 Mon Sep 17 00:00:00 2001
From: Stefan Behnel
Date: Sun, 11 Feb 2024 14:18:09 +0100
Subject: [PATCH] Reimplement "Dependencies.strip_string_literals()" to handle
 unclosed string literals and recursive f-strings.

Closes https://github.com/cython/cython/issues/5977
---
 Cython/Build/Dependencies.py           | 192 +++++++++++++++----------
 Cython/Build/Tests/TestDependencies.py | 113 ++++++++++++++-
 2 files changed, 227 insertions(+), 78 deletions(-)

diff --git a/Cython/Build/Dependencies.py b/Cython/Build/Dependencies.py
index 48aeeb7d938..8749a093a03 100644
--- a/Cython/Build/Dependencies.py
+++ b/Cython/Build/Dependencies.py
@@ -306,94 +306,132 @@ def apply(self, extension):
             setattr(extension, key, value)
 
 
-@cython.locals(start=cython.Py_ssize_t, q=cython.Py_ssize_t,
-               single_q=cython.Py_ssize_t, double_q=cython.Py_ssize_t,
-               hash_mark=cython.Py_ssize_t, end=cython.Py_ssize_t,
-               k=cython.Py_ssize_t, counter=cython.Py_ssize_t, quote_len=cython.Py_ssize_t)
+_FIND_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<comment> [#] ) |
+    (?P<braces>  [}]+ ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+_FIND_STRING_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+_FIND_FSTRING_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<braces> [{]+ | [}]+ ) |
+    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+
+@cython.locals(single_q=cython.Py_ssize_t, double_q=cython.Py_ssize_t,
+               hash_mark=cython.Py_ssize_t, k=cython.Py_ssize_t)
 def strip_string_literals(code, prefix='__Pyx_L'):
     """
     Normalizes every string literal to be of the form '__Pyx_Lxxx',
     returning the normalized code and a mapping of labels to
     string literals.
     """
-    new_code = []
-    literals = {}
-    counter = 0
-    start = q = 0
-    in_quote = False
-    hash_mark = single_q = double_q = -1
-    code_len = len(code)
-    quote_type = None
-    quote_len = -1
-
-    while True:
-        if hash_mark < q:
-            hash_mark = code.find('#', q)
-        if single_q < q:
-            single_q = code.find("'", q)
-        if double_q < q:
-            double_q = code.find('"', q)
-        q = min(single_q, double_q)
-        if q == -1:
-            q = max(single_q, double_q)
-
-        # We're done.
-        if q == -1 and hash_mark == -1:
-            new_code.append(code[start:])
-            break
-
-        # Try to close the quote.
-        elif in_quote:
-            if code[q-1] == '\\':
-                k = 2
-                while q >= k and code[q-k] == '\\':
-                    k += 1
-                if k % 2 == 0:
-                    q += 1
+    new_code: list = []
+    literals: dict = {}
+    counter: cython.Py_ssize_t = 0
+    find_token = _FIND_TOKEN
+
+    def new_label(literal) -> str:
+        nonlocal counter
+        counter += 1
+        label = f"{prefix}{counter}_"
+        literals[label] = literal
+        return label
+
+    def parse_string(quote_type: str, start: cython.Py_ssize_t, is_fstring: cython.bint) -> cython.Py_ssize_t:
+        charpos: cython.Py_ssize_t = start
+        end: cython.Py_ssize_t
+
+        find_token = _FIND_FSTRING_TOKEN if is_fstring else _FIND_STRING_TOKEN
+
+        while charpos != -1:
+            token = find_token(code, charpos)
+            if token is None:
+                # This probably indicates an unclosed string literal, i.e. a broken file.
+                new_code.append(new_label(code[start:]))
+                charpos = -1
+                break
+            end = token.end()
+
+            if token['escape']:
+                charpos = end
+                if len(token['escape']) % 2 == 0 and token['escaped_quote'] == quote_type[0]:
+                    # quote is not actually escaped and might be part of a terminator, look at it next
+                    charpos -= 1
+            elif is_fstring and token['braces']:
+                # formats or brace(s) in fstring
+                charpos = end
+                if len(token['braces']) % 2 == 0:
+                    # normal brace characters in string
                     continue
-            if code[q] == quote_type and (
-                    quote_len == 1 or (code_len > q + 2 and quote_type == code[q+1] == code[q+2])):
-                counter += 1
-                label = "%s%s_" % (prefix, counter)
-                literals[label] = code[start+quote_len:q]
-                full_quote = code[q:q+quote_len]
-                new_code.append(full_quote)
-                new_code.append(label)
-                new_code.append(full_quote)
-                q += quote_len
-                in_quote = False
-                start = q
-            else:
-                q += 1
-
-        # Process comment.
-        elif -1 != hash_mark and (hash_mark < q or q == -1):
-            new_code.append(code[start:hash_mark+1])
-            end = code.find('\n', hash_mark)
-            counter += 1
-            label = "%s%s_" % (prefix, counter)
-            if end == -1:
-                end_or_none = None
+                if token['braces'][-1] == '{':
+                    if start < end-1:
+                        new_code.append(new_label(code[start : end-1]))
+                    new_code.append('{')
+                    start = charpos = parse_code(end, in_fstring=True)
+            elif token['quote'].startswith(quote_type):
+                # closing quote found (potentially together with further, unrelated quotes)
+                charpos = token.start('quote')
+                if charpos > start:
+                    new_code.append(new_label(code[start : charpos]))
+                new_code.append(quote_type)
+                charpos += len(quote_type)
+                break
             else:
-                end_or_none = end
-            literals[label] = code[hash_mark+1:end_or_none]
-            new_code.append(label)
-            if end == -1:
+                # string internal quote(s)
+                charpos = end
+
+        return charpos
+
+    def parse_code(start: cython.Py_ssize_t, in_fstring: cython.bint = False) -> cython.Py_ssize_t:
+        charpos: cython.Py_ssize_t = start
+        end: cython.Py_ssize_t
+        quote: str
+
+        while charpos != -1:
+            token = find_token(code, charpos)
+            if token is None:
+                new_code.append(code[start:])
+                charpos = -1
                 break
-            start = q = end
+            charpos = end = token.end()
+
+            if token['quote']:
+                quote = token['quote']
+                if len(quote) >= 6:
+                    # ignore empty triple-quoted strings: '''''' or """"""
+                    quote = quote[:len(quote) % 6]
+                if quote and len(quote) != 2:
+                    if len(quote) > 3:
+                        end -= len(quote) - 3
+                        quote = quote[:3]
+                    new_code.append(code[start:end])
+                    start = charpos = parse_string(quote, end, is_fstring=token['fstring'])
+
+            elif token['comment']:
+                new_code.append(code[start:end+1])
+                charpos = code.find('\n', end+1)
+                new_code.append(new_label(code[end+1 : charpos if charpos != -1 else None]))
+                if charpos == -1:
+                    break  # EOF
+                start = charpos
+
+            elif token['braces']:
+                if in_fstring and len(token['braces']) % 2 == 1:
+                    # closing '}' of f-string
+                    charpos = end = token.start() + 1
+                    new_code.append(code[start:end])  # with '}'
+                    break
 
-        # Open the quote.
-        else:
-            if code_len >= q+3 and (code[q] == code[q+1] == code[q+2]):
-                quote_len = 3
-            else:
-                quote_len = 1
-            in_quote = True
-            quote_type = code[q]
-            new_code.append(code[start:q])
-            start = q
-            q += quote_len
+        return charpos
 
+    parse_code(0)
     return "".join(new_code), literals
 
 
diff --git a/Cython/Build/Tests/TestDependencies.py b/Cython/Build/Tests/TestDependencies.py
index bc6b99bab45..e0638d38705 100644
--- a/Cython/Build/Tests/TestDependencies.py
+++ b/Cython/Build/Tests/TestDependencies.py
@@ -1,10 +1,13 @@
 import contextlib
 import os.path
+import pathlib
+import re
 import tempfile
 import unittest
 from os.path import join as pjoin
 
-from ..Dependencies import extended_iglob
+from ...Utils import open_source_file
+from ..Dependencies import extended_iglob, strip_string_literals
 
 
 @contextlib.contextmanager
@@ -131,3 +134,111 @@ def test_extended_iglob_double_star(self):
         self.files_equal("**/*.{py}", all_files[1::2])
         self.files_equal("*/*/*.py", files[1::2])
         self.files_equal("**/*.py", all_files[1::2])
+
+
+class TestCodeProcessing(unittest.TestCase):
+    maxDiff = None
+
+    @staticmethod
+    def _rebuild_string(stripped, literals):
+        def lookup(match):
+            return literals[match.group()]
+
+        return re.sub("__Pyx_L[0-9]+_", lookup, stripped)
+
+    def test_strip_string_literals(self):
+        def strip_equals(s, expected):
+            stripped, literals = strip_string_literals(s)
+            self.assertEqual(expected, stripped)
+
+            recovered = self._rebuild_string(stripped, literals)
+            self.assertEqual(s, recovered)
+
+        unchanged = [
+            """abc""",
+            """123""",
+            """func(123)""",
+            """ '' """,
+            """ '''''''''''' """,
+            """ '''''''''''''' """,
+        ]
+
+        tests = [(code, code) for code in unchanged] + [
+            (""" '''' ''' """,
+             """ '''__Pyx_L1_''' """),
+
+            (''' """" """ ''',
+             ''' """__Pyx_L1_""" '''),
+
+            (""" func('xyz') + " " + "" '' # '' | "" "123" 'xyz' "' """,
+             """ func('__Pyx_L1_') + "__Pyx_L2_" + "" '' # __Pyx_L3_"""),
+
+            (""" f'f' """,
+             """ f'__Pyx_L1_' """),
+
+            (""" f'a{123}b' """,
+             """ f'__Pyx_L1_{123}__Pyx_L2_' """),
+
+            (""" f'{1}{f'xyz'}' """,
+             """ f'{1}{f'__Pyx_L1_'}' """),
+
+            (""" f'{f'''xyz{f\"""abc\"""}'''}' """,
+             """ f'{f'''__Pyx_L1_{f\"""__Pyx_L2_\"""}'''}' """),
+
+            (""" f'{{{{{"abc"}}}}}{{}}{{' == '{{abc}}{}{' """,
+             """ f'__Pyx_L1_{"__Pyx_L2_"}__Pyx_L3_' == '__Pyx_L4_' """),
+        ]
+
+        for code, expected in tests:
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+            code = code.strip()
+            expected = expected.strip()
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+            code += "\n"
+            expected += "\n"
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+
+        # GH5977: unclosed string literal
+        strip_equals(
+            """ print("Say something: %s' % something) """,
+            """ print("__Pyx_L1_"""
+        )
+
+
+    def _test_all_files(self, base_dir, file_paths):
+        _find_leftover_string = re.compile(r"""[^_'"}](['"]+)[^_'"{]""").search
+        for file_path in sorted(file_paths):
+            with self.subTest(file=str(file_path.relative_to(base_dir))):
+                with open_source_file(str(file_path)) as f:
+                    code = f.read()
+                stripped, literals = strip_string_literals(code)
+
+                match = _find_leftover_string(stripped)
+                if match and len(match.group(1)) != 2:
+                    match_pos = match.start() + 1
+                    self.fail(f"Leftover string found: {stripped[match_pos - 12 : match_pos + 12]!r}")
+
+                recovered = self._rebuild_string(stripped, literals)
+                self.assertEqual(code, recovered)
+
+
+    def test_strip_string_literals_py_files(self):
+        # process all .py files in the Cython package
+        package_dir = pathlib.Path(__file__).absolute().parents[2]
+        assert package_dir.name == 'Cython'
+        base_dir = package_dir.parent
+        self._test_all_files(base_dir, package_dir.rglob("*.py"))
+
+    def test_strip_string_literals_test_files(self):
+        # process all .py[x] files in the tests package
+        base_dir = pathlib.Path(__file__).absolute().parents[3]
+        tests_dir = base_dir / 'tests'
+        test_files = []
+        for test_subdir in tests_dir.iterdir():
+            if test_subdir.is_dir() and test_subdir.name != 'errors':
+                test_files.extend(test_subdir.rglob("*.py"))
+                test_files.extend(test_subdir.rglob("*.pyx"))
+        self._test_all_files(base_dir, test_files)
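
A minimal usage sketch of the reimplemented helper, taken from the new test cases above: the stripped form is asserted verbatim in TestCodeProcessing.test_strip_string_literals, and the contents of literals follow from its round-trip check (_rebuild_string).

    import re
    from Cython.Build.Dependencies import strip_string_literals

    stripped, literals = strip_string_literals(""" f'a{123}b' """)
    # String bodies are replaced by __Pyx_L<n>_ labels; the f-string
    # placeholder {123} and the quote characters themselves are kept.
    assert stripped == """ f'__Pyx_L1_{123}__Pyx_L2_' """
    assert literals == {'__Pyx_L1_': 'a', '__Pyx_L2_': 'b'}

    # Substituting the labels back recovers the original source exactly.
    assert re.sub("__Pyx_L[0-9]+_", lambda m: literals[m.group()], stripped) == """ f'a{123}b' """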