From 29db6ca11a3db0d04006bfe844c0c97bb179e60d Mon Sep 17 00:00:00 2001 From: abhi210 <27881020+Abhi210@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:38:53 +0530 Subject: [PATCH 1/3] This PR references issue gh-140797. It adds validation to re.Scanner.__init__ that rejects lexicon patterns containing capturing groups. If a user-supplied pattern contains any capturing groups, Scanner now raises ValueError with a clear message advising the use of non-capturing groups (?:...) instead. --- Lib/re/__init__.py | 9 ++++++++- Lib/test/test_re.py | 19 +++++++++++++++++++ ...-11-03-16-23-54.gh-issue-140797.DuFEeR.rst | 4 ++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index a5316391297f4c..54ed4d350eb3de 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -397,9 +397,16 @@ def __init__(self, lexicon, flags=0): s = _parser.State() s.flags = flags for phrase, action in lexicon: + sub_pattern = _parser.parse(phrase, flags) + if sub_pattern.state.groups != 1: # <- 1 means always has \0 + raise ValueError( + "re.Scanner lexicon patterns must not contain capturing groups;\n" + "Please use non-capturing groups (?:...) 
instead" + ) + gid = s.opengroup() p.append(_parser.SubPattern(s, [ - (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), + (SUBPATTERN, (gid, 0, 0, sub_pattern)), ])) s.closegroup(gid, p[-1]) p = _parser.SubPattern(s, [(BRANCH, (None, p))]) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 5fc95087f2b6ad..dde7bd4194b17c 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1639,6 +1639,25 @@ def s_int(scanner, token): return int(token) (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) + def test_bug_140797(self): + #bug 140797: remove capturing groups compilation from re.Scanner + + #Presence of Capturing group throws an error + lex = [("(a)b", None)] + with self.assertRaises(ValueError): + Scanner(lex) + + #Presence of non-capturing groups should pass normally + s = Scanner([("(?:a)b", lambda scanner, token: token)]) + result, rem = s.scan("ab") + self.assertEqual(result,['ab']) + self.assertEqual(rem,'') + + #Testing a very complex capturing group + pattern= "(?P<name>a)" + with self.assertRaises(ValueError): + Scanner([(pattern, None)]) + def test_bug_448951(self): # bug 448951 (similar to 429357, but with single char match) # (Also test greedy matches.) diff --git a/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst new file mode 100644 index 00000000000000..54ff8733e00dac --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst @@ -0,0 +1,4 @@ +The re.Scanner class now forbids regular expressions containing capturing +groups in its lexicon patterns. Patterns using capturing groups could +previously lead to crashes with a segmentation fault. Use non-capturing groups +(?:...) instead. 
From d1d582e4c0d9395efc6aa4faa1bc3560cf4e6163 Mon Sep 17 00:00:00 2001 From: Abhishek Tiwari Date: Tue, 4 Nov 2025 15:29:22 +0530 Subject: [PATCH 2/3] Update Lib/test/test_re.py Co-authored-by: Serhiy Storchaka --- Lib/test/test_re.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index a876d2df077085..cf58821124cb52 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1640,7 +1640,7 @@ def s_int(scanner, token): return int(token) 'op+', 'bar'], '')) def test_bug_gh140797(self): - # gh140797: capturing groups is not allowed in re.Scanner + # gh140797: Capturing groups are not allowed in re.Scanner msg = "Cannot use capturing groups in re.Scanner" # Capturing group throws an error From f4f7d4db1f245d8a7574a1fbfa94af8e6392d543 Mon Sep 17 00:00:00 2001 From: Abhishek Tiwari Date: Tue, 4 Nov 2025 15:29:57 +0530 Subject: [PATCH 3/3] Update Lib/test/test_re.py Co-authored-by: Serhiy Storchaka --- Lib/test/test_re.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index cf58821124cb52..9f6f04bf6b8347 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1642,7 +1642,7 @@ def s_int(scanner, token): return int(token) def test_bug_gh140797(self): # gh140797: Capturing groups are not allowed in re.Scanner - msg = "Cannot use capturing groups in re.Scanner" + msg = r"Cannot use capturing groups in re\.Scanner" # Capturing group throws an error with self.assertRaisesRegex(ValueError, msg): Scanner([("(a)b", None)])