From f564c13eca49782b3335a9df578c6f29a3361e28 Mon Sep 17 00:00:00 2001 From: Abhishek Tiwari Date: Tue, 4 Nov 2025 16:24:28 +0530 Subject: [PATCH] gh-140797: Forbid capturing groups in re.Scanner lexicon patterns (GH-140944) (cherry picked from commit fa9c3eefd475f0647a69bf3f49db8100848fb6a9) Co-authored-by: Abhishek Tiwari --- Lib/re/__init__.py | 5 ++++- Lib/test/test_re.py | 18 ++++++++++++++++++ ...5-11-03-16-23-54.gh-issue-140797.DuFEeR.rst | 2 ++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 7e8abbf6ffe155..5a821411ad6e50 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -399,9 +399,12 @@ def __init__(self, lexicon, flags=0): s = _parser.State() s.flags = flags for phrase, action in lexicon: + sub_pattern = _parser.parse(phrase, flags) + if sub_pattern.state.groups != 1: + raise ValueError("Cannot use capturing groups in re.Scanner") gid = s.opengroup() p.append(_parser.SubPattern(s, [ - (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), + (SUBPATTERN, (gid, 0, 0, sub_pattern)), ])) s.closegroup(gid, p[-1]) p = _parser.SubPattern(s, [(BRANCH, (None, p))]) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 813cb4a3f54f23..8d39e7f2086666 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1638,6 +1638,24 @@ def s_int(scanner, token): return int(token) (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) + def test_bug_gh140797(self): + # gh140797: Capturing groups are not allowed in re.Scanner + + msg = r"Cannot use capturing groups in re\.Scanner" + # Capturing group throws an error + with self.assertRaisesRegex(ValueError, msg): + Scanner([("(a)b", None)]) + + # Named Group + with self.assertRaisesRegex(ValueError, msg): + Scanner([("(?Pa)", None)]) + + # Non-capturing groups should pass normally + s = Scanner([("(?:a)b", lambda scanner, token: token)]) + result, rem = s.scan("ab") + self.assertEqual(result,['ab']) + self.assertEqual(rem,'') + def test_bug_448951(self): # bug 448951 (similar to 429357, but with single char match) # (Also test greedy matches.) diff --git a/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst new file mode 100644 index 00000000000000..493b740261e64c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst @@ -0,0 +1,2 @@ +The undocumented :class:`!re.Scanner` class now forbids regular expressions containing capturing groups in its lexicon patterns. Patterns using capturing groups could +previously lead to crashes with segmentation fault. Use non-capturing groups (?:...) instead.