From c15cef2915335581c368006b1815baa97dbbab17 Mon Sep 17 00:00:00 2001
From: Stefan Behnel
Date: Sun, 11 Feb 2024 14:18:09 +0100
Subject: [PATCH] Reimplement "Dependencies.strip_string_literals()" to handle
 unclosed string literals and recursive f-strings.

Closes https://github.com/cython/cython/issues/5977
---
 Cython/Build/Dependencies.py           | 192 +++++++++++++++----------
 Cython/Build/Tests/TestDependencies.py | 113 ++++++++++++++-
 2 files changed, 227 insertions(+), 78 deletions(-)

diff --git a/Cython/Build/Dependencies.py b/Cython/Build/Dependencies.py
index 48aeeb7d938..8749a093a03 100644
--- a/Cython/Build/Dependencies.py
+++ b/Cython/Build/Dependencies.py
@@ -306,94 +306,132 @@ def apply(self, extension):
             setattr(extension, key, value)
 
 
-@cython.locals(start=cython.Py_ssize_t, q=cython.Py_ssize_t,
-               single_q=cython.Py_ssize_t, double_q=cython.Py_ssize_t,
-               hash_mark=cython.Py_ssize_t, end=cython.Py_ssize_t,
-               k=cython.Py_ssize_t, counter=cython.Py_ssize_t, quote_len=cython.Py_ssize_t)
+_FIND_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<comment> [#] ) |
+    (?P<braces>  [}]+ ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+_FIND_STRING_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+_FIND_FSTRING_TOKEN = cython.declare(object, re.compile(r"""
+    (?P<braces> [{]+ | [}]+ ) |
+    (?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
+    (?P<fstring> f )? (?P<quote> '+ | "+ )
+""", re.VERBOSE).search)
+
+
+@cython.locals(single_q=cython.Py_ssize_t, double_q=cython.Py_ssize_t,
+               hash_mark=cython.Py_ssize_t, k=cython.Py_ssize_t)
 def strip_string_literals(code, prefix='__Pyx_L'):
     """
     Normalizes every string literal to be of the form '__Pyx_Lxxx',
     returning the normalized code and a mapping of labels to
     string literals.
     """
-    new_code = []
-    literals = {}
-    counter = 0
-    start = q = 0
-    in_quote = False
-    hash_mark = single_q = double_q = -1
-    code_len = len(code)
-    quote_type = None
-    quote_len = -1
-
-    while True:
-        if hash_mark < q:
-            hash_mark = code.find('#', q)
-        if single_q < q:
-            single_q = code.find("'", q)
-        if double_q < q:
-            double_q = code.find('"', q)
-        q = min(single_q, double_q)
-        if q == -1:
-            q = max(single_q, double_q)
-
-        # We're done.
-        if q == -1 and hash_mark == -1:
-            new_code.append(code[start:])
-            break
-
-        # Try to close the quote.
-        elif in_quote:
-            if code[q-1] == '\\':
-                k = 2
-                while q >= k and code[q-k] == '\\':
-                    k += 1
-                if k % 2 == 0:
-                    q += 1
+    new_code: list = []
+    literals: dict = {}
+    counter: cython.Py_ssize_t = 0
+    find_token = _FIND_TOKEN
+
+    def new_label(literal) -> str:
+        nonlocal counter
+        counter += 1
+        label = f"{prefix}{counter}_"
+        literals[label] = literal
+        return label
+
+    def parse_string(quote_type: str, start: cython.Py_ssize_t, is_fstring: cython.bint) -> cython.Py_ssize_t:
+        charpos: cython.Py_ssize_t = start
+        end: cython.Py_ssize_t
+
+        find_token = _FIND_FSTRING_TOKEN if is_fstring else _FIND_STRING_TOKEN
+
+        while charpos != -1:
+            token = find_token(code, charpos)
+            if token is None:
+                # This probably indicates an unclosed string literal, i.e. a broken file.
+                new_code.append(new_label(code[start:]))
+                charpos = -1
+                break
+            end = token.end()
+
+            if token['escape']:
+                charpos = end
+                if len(token['escape']) % 2 == 0 and token['escaped_quote'] == quote_type[0]:
+                    # quote is not actually escaped and might be part of a terminator, look at it next
+                    charpos -= 1
+            elif is_fstring and token['braces']:
+                # formats or brace(s) in fstring
+                charpos = end
+                if len(token['braces']) % 2 == 0:
+                    # normal brace characters in string
                     continue
-            if code[q] == quote_type and (
-                    quote_len == 1 or (code_len > q + 2 and quote_type == code[q+1] == code[q+2])):
-                counter += 1
-                label = "%s%s_" % (prefix, counter)
-                literals[label] = code[start+quote_len:q]
-                full_quote = code[q:q+quote_len]
-                new_code.append(full_quote)
-                new_code.append(label)
-                new_code.append(full_quote)
-                q += quote_len
-                in_quote = False
-                start = q
-            else:
-                q += 1
-
-        # Process comment.
-        elif -1 != hash_mark and (hash_mark < q or q == -1):
-            new_code.append(code[start:hash_mark+1])
-            end = code.find('\n', hash_mark)
-            counter += 1
-            label = "%s%s_" % (prefix, counter)
-            if end == -1:
-                end_or_none = None
+                if token['braces'][-1] == '{':
+                    if start < end-1:
+                        new_code.append(new_label(code[start : end-1]))
+                    new_code.append('{')
+                    start = charpos = parse_code(end, in_fstring=True)
+            elif token['quote'].startswith(quote_type):
+                # closing quote found (potentially together with further, unrelated quotes)
+                charpos = token.start('quote')
+                if charpos > start:
+                    new_code.append(new_label(code[start : charpos]))
+                new_code.append(quote_type)
+                charpos += len(quote_type)
+                break
             else:
-                end_or_none = end
-            literals[label] = code[hash_mark+1:end_or_none]
-            new_code.append(label)
-            if end == -1:
+                # string internal quote(s)
+                charpos = end
+
+        return charpos
+
+    def parse_code(start: cython.Py_ssize_t, in_fstring: cython.bint = False) -> cython.Py_ssize_t:
+        charpos: cython.Py_ssize_t = start
+        end: cython.Py_ssize_t
+        quote: str
+
+        while charpos != -1:
+            token = find_token(code, charpos)
+            if token is None:
+                new_code.append(code[start:])
+                charpos = -1
                 break
-            start = q = end
+            charpos = end = token.end()
+
+            if token['quote']:
+                quote = token['quote']
+                if len(quote) >= 6:
+                    # ignore empty triple-quoted strings: '''''' or """"""
+                    quote = quote[:len(quote) % 6]
+                if quote and len(quote) != 2:
+                    if len(quote) > 3:
+                        end -= len(quote) - 3
+                        quote = quote[:3]
+                    new_code.append(code[start:end])
+                    start = charpos = parse_string(quote, end, is_fstring=token['fstring'])
+
+            elif token['comment']:
+                new_code.append(code[start:end+1])
+                charpos = code.find('\n', end+1)
+                new_code.append(new_label(code[end+1 : charpos if charpos != -1 else None]))
+                if charpos == -1:
+                    break  # EOF
+                start = charpos
+
+            elif token['braces']:
+                if in_fstring and len(token['braces']) % 2 == 1:
+                    # closing '}' of f-string
+                    charpos = end = token.start() + 1
+                    new_code.append(code[start:end])  # with '}'
+                    break
 
-        # Open the quote.
-        else:
-            if code_len >= q+3 and (code[q] == code[q+1] == code[q+2]):
-                quote_len = 3
-            else:
-                quote_len = 1
-            in_quote = True
-            quote_type = code[q]
-            new_code.append(code[start:q])
-            start = q
-            q += quote_len
+        return charpos
 
+    parse_code(0)
     return "".join(new_code), literals
 
 
diff --git a/Cython/Build/Tests/TestDependencies.py b/Cython/Build/Tests/TestDependencies.py
index bc6b99bab45..e0638d38705 100644
--- a/Cython/Build/Tests/TestDependencies.py
+++ b/Cython/Build/Tests/TestDependencies.py
@@ -1,10 +1,13 @@
 import contextlib
 import os.path
+import pathlib
+import re
 import tempfile
 import unittest
 from os.path import join as pjoin
 
-from ..Dependencies import extended_iglob
+from ...Utils import open_source_file
+from ..Dependencies import extended_iglob, strip_string_literals
 
 
 @contextlib.contextmanager
@@ -131,3 +134,111 @@ def test_extended_iglob_double_star(self):
         self.files_equal("**/*.{py}", all_files[1::2])
         self.files_equal("*/*/*.py", files[1::2])
         self.files_equal("**/*.py", all_files[1::2])
+
+
+class TestCodeProcessing(unittest.TestCase):
+    maxDiff = None
+
+    @staticmethod
+    def _rebuild_string(stripped, literals):
+        def lookup(match):
+            return literals[match.group()]
+
+        return re.sub("__Pyx_L[0-9]+_", lookup, stripped)
+
+    def test_strip_string_literals(self):
+        def strip_equals(s, expected):
+            stripped, literals = strip_string_literals(s)
+            self.assertEqual(expected, stripped)
+
+            recovered = self._rebuild_string(stripped, literals)
+            self.assertEqual(s, recovered)
+
+        unchanged = [
+            """abc""",
+            """123""",
+            """func(123)""",
+            """ '' """,
+            """ '''''''''''' """,
+            """ '''''''''''''' """,
+        ]
+
+        tests = [(code, code) for code in unchanged] + [
+            (""" '''' ''' """,
+             """ '''__Pyx_L1_''' """),
+
+            (''' """" """ ''',
+             ''' """__Pyx_L1_""" '''),
+
+            (""" func('xyz') + " " + "" '' # '' | "" "123" 'xyz' "' """,
+             """ func('__Pyx_L1_') + "__Pyx_L2_" + "" '' # __Pyx_L3_"""),
+
+            (""" f'f' """,
+             """ f'__Pyx_L1_' """),
+
+            (""" f'a{123}b' """,
+             """ f'__Pyx_L1_{123}__Pyx_L2_' """),
+
+            (""" f'{1}{f'xyz'}' """,
+             """ f'{1}{f'__Pyx_L1_'}' """),
+
+            (""" f'{f'''xyz{f\"""abc\"""}'''}' """,
+             """ f'{f'''__Pyx_L1_{f\"""__Pyx_L2_\"""}'''}' """),
+
+            (""" f'{{{{{"abc"}}}}}{{}}{{' == '{{abc}}{}{' """,
+             """ f'__Pyx_L1_{"__Pyx_L2_"}__Pyx_L3_' == '__Pyx_L4_' """),
+        ]
+
+        for code, expected in tests:
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+            code = code.strip()
+            expected = expected.strip()
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+            code += "\n"
+            expected += "\n"
+            with self.subTest(code=code):
+                strip_equals(code, expected)
+
+        # GH5977: unclosed string literal
+        strip_equals(
+            """ print("Say something: %s' % something) """,
+            """ print("__Pyx_L1_"""
+        )
+
+
+    def _test_all_files(self, base_dir, file_paths):
+        _find_leftover_string = re.compile(r"""[^_'"}](['"]+)[^_'"{]""").search
+        for file_path in sorted(file_paths):
+            with self.subTest(file=str(file_path.relative_to(base_dir))):
+                with open_source_file(str(file_path)) as f:
+                    code = f.read()
+                stripped, literals = strip_string_literals(code)
+
+                match = _find_leftover_string(stripped)
+                if match and len(match.group(1)) != 2:
+                    match_pos = match.start() + 1
+                    self.fail(f"Leftover string found: {stripped[match_pos - 12 : match_pos + 12]!r}")
+
+                recovered = self._rebuild_string(stripped, literals)
+                self.assertEqual(code, recovered)
+
+
+    def test_strip_string_literals_py_files(self):
+        # process all .py files in the Cython package
+        package_dir = pathlib.Path(__file__).absolute().parents[2]
+        assert package_dir.name == 'Cython'
+        base_dir = package_dir.parent
+        self._test_all_files(base_dir, package_dir.rglob("*.py"))
+
+    def test_strip_string_literals_test_files(self):
+        # process all .py[x] files in the tests package
+        base_dir = pathlib.Path(__file__).absolute().parents[3]
+        tests_dir = base_dir / 'tests'
+        test_files = []
+        for test_subdir in tests_dir.iterdir():
+            if test_subdir.is_dir() and test_subdir.name != 'errors':
+                test_files.extend(test_subdir.rglob("*.py"))
+                test_files.extend(test_subdir.rglob("*.pyx"))
+        self._test_all_files(base_dir, test_files)
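
A minimal usage sketch of the reimplemented helper, taken from the new test cases above: the stripped form is asserted verbatim in TestCodeProcessing.test_strip_string_literals, and the contents of literals follow from its round-trip check (_rebuild_string).

    import re
    from Cython.Build.Dependencies import strip_string_literals

    stripped, literals = strip_string_literals(""" f'a{123}b' """)
    # String bodies are replaced by __Pyx_L<n>_ labels; the f-string
    # placeholder {123} and the quote characters themselves are kept.
    assert stripped == """ f'__Pyx_L1_{123}__Pyx_L2_' """
    assert literals == {'__Pyx_L1_': 'a', '__Pyx_L2_': 'b'}

    # Substituting the labels back recovers the original source exactly.
    assert re.sub("__Pyx_L[0-9]+_", lambda m: literals[m.group()], stripped) == """ f'a{123}b' """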