From 345b390ed69f36681dbc41187bc8f49cd9135b54 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 21 Mar 2022 18:28:22 +0200 Subject: [PATCH] bpo-433030: Add support of atomic grouping in regular expressions (GH-31982) * Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs --- Doc/library/re.rst | 54 ++++ Doc/whatsnew/3.11.rst | 6 + Lib/sre_compile.py | 38 ++- Lib/sre_constants.py | 5 +- Lib/sre_parse.py | 32 +- Lib/test/test_re.py | 294 ++++++++++++++---- Misc/ACKS | 1 + .../2022-03-19-08-42-57.bpo-433030.UTwRX7.rst | 2 + Modules/_sre.c | 24 +- Modules/sre_constants.h | 31 +- Modules/sre_lib.h | 198 ++++++++++++ 11 files changed, 593 insertions(+), 92 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 950a5b1fdc240f..02c0a849610b0c 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -154,6 +154,30 @@ The special characters are: characters as possible will be matched. Using the RE ``<.*?>`` will match only ``''``. +.. index:: + single: *+; in regular expressions + single: ++; in regular expressions + single: ?+; in regular expressions + +``*+``, ``++``, ``?+`` + Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is + appended also match as many times as possible. + However, unlike the true greedy qualifiers, these do not allow + back-tracking when the expression following it fails to match. + These are known as :dfn:`possessive` qualifiers. + For example, ``a*a`` will match ``'aaaa'`` because the ``a*`` will match + all 4 ``'a'``s, but, when the final ``'a'`` is encountered, the + expression is backtracked so that in the end the ``a*`` ends up matching + 3 ``'a'``s total, and the fourth ``'a'`` is matched by the final ``'a'``. + However, when ``a*+a`` is used to match ``'aaaa'``, the ``a*+`` will + match all 4 ``'a'``, but when the final ``'a'`` fails to find any more + characters to match, the expression cannot be backtracked and will thus + fail to match. + ``x*+``, ``x++`` and ``x?+`` are equivalent to ``(?>x*)``, ``(?>x+)`` + and ``(?>x?)`` correspondigly. + + .. versionadded:: 3.11 + .. index:: single: {} (curly brackets); in regular expressions @@ -178,6 +202,21 @@ The special characters are: 6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters, while ``a{3,5}?`` will only match 3 characters. +``{m,n}+`` + Causes the resulting RE to match from *m* to *n* repetitions of the + preceding RE, attempting to match as many repetitions as possible + *without* establishing any backtracking points. + This is the possessive version of the qualifier above. + For example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` + attempt to match 5 ``'a'`` characters, then, requiring 2 more ``'a'``s, + will need more characters than available and thus fail, while + ``a{3,5}aa`` will match with ``a{3,5}`` capturing 5, then 4 ``'a'``s + by backtracking and then the final 2 ``'a'``s are matched by the final + ``aa`` in the pattern. + ``x{m,n}+`` is equivalent to ``(?>x{m,n})``. + + .. versionadded:: 3.11 + .. index:: single: \ (backslash); in regular expressions ``\`` @@ -336,6 +375,21 @@ The special characters are: .. versionchanged:: 3.7 The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group. +``(?>...)`` + Attempts to match ``...`` as if it was a separate regular expression, and + if successful, continues to match the rest of the pattern following it. + If the subsequent pattern fails to match, the stack can only be unwound + to a point *before* the ``(?>...)`` because once exited, the expression, + known as an :dfn:`atomic group`, has thrown away all stack points within + itself. + Thus, ``(?>.*).`` would never match anything because first the ``.*`` + would match all characters possible, then, having nothing left to match, + the final ``.`` would fail to match. + Since there are no stack points saved in the Atomic Group, and there is + no stack point before it, the entire expression would thus fail to match. + + .. versionadded:: 3.11 + .. index:: single: (?P<; in regular expressions ``(?P...)`` diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index b7e9dc6e9e37f8..ca7699daa98542 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -295,6 +295,12 @@ os instead of ``CryptGenRandom()`` which is deprecated. (Contributed by Dong-hee Na in :issue:`44611`.) +re +-- + +* Atomic grouping (``(?>...)``) and possessive qualifiers (``*+``, ``++``, + ``?+``, ``{m,n}+``) are now supported in regular expressions. + (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.) shutil ------ diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c6398bfb83a576..0867200a59a230 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -17,11 +17,16 @@ assert _sre.MAGIC == MAGIC, "SRE module mismatch" _LITERAL_CODES = {LITERAL, NOT_LITERAL} -_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + # Sets of lowercase characters which have the same uppercase. _equivalences = ( # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I @@ -138,10 +143,7 @@ def _compile(code, pattern, flags): if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) if _simple(av[2]): - if op is MAX_REPEAT: - emit(REPEAT_ONE) - else: - emit(MIN_REPEAT_ONE) + emit(REPEATING_CODES[op][2]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) @@ -149,16 +151,13 @@ def _compile(code, pattern, flags): emit(SUCCESS) code[skip] = _len(code) - skip else: - emit(REPEAT) + emit(REPEATING_CODES[op][0]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip - if op is MAX_REPEAT: - emit(MAX_UNTIL) - else: - emit(MIN_UNTIL) + emit(REPEATING_CODES[op][1]) elif op is SUBPATTERN: group, add_flags, del_flags, p = av if group: @@ -169,6 +168,17 @@ def _compile(code, pattern, flags): if group: emit(MARK) emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -709,7 +719,8 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE): + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: max = 'MAXREPEAT' @@ -725,6 +736,11 @@ def print_2(*args): print_(op, skip, arg, to=i+skip) dis_(i+2, i+skip) i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip elif op is INFO: skip, flags, min, max = code[i: i+4] if max == MAXREPEAT: diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8e613cb3fa5dcb..a00b0170607b59 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20171005 +MAGIC = 20220318 from _sre import MAXREPEAT, MAXGROUPS @@ -97,6 +97,9 @@ def _makecodes(names): REPEAT_ONE SUBPATTERN MIN_REPEAT_ONE + ATOMIC_GROUP + POSSESSIVE_REPEAT + POSSESSIVE_REPEAT_ONE GROUPREF_IGNORE IN_IGNORE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index bb951073938706..b91082e48fafe1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -25,7 +25,7 @@ WHITESPACE = frozenset(" \t\n\r\v\f") -_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) ESCAPES = { @@ -190,6 +190,10 @@ def getwidth(self): i, j = av.getwidth() lo = lo + i hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j elif op is SUBPATTERN: i, j = av[-1].getwidth() lo = lo + i @@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False): if group is None and not add_flags and not del_flags: item = p if sourcematch("?"): + # Non-Greedy Match subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) else: + # Greedy Match subpattern[-1] = (MAX_REPEAT, (min, max, item)) elif this == ".": @@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False): elif this == "(": start = source.tell() - 1 - group = True + capture = True + atomic = False name = None add_flags = 0 del_flags = 0 @@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False): len(char) + 2) elif char == ":": # non-capturing group - group = None + capture = False elif char == "#": # comment while True: @@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False): subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) continue + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False): continue add_flags, del_flags = flags - group = None + capture = False else: raise source.error("unknown extension ?" + char, len(char) + 1) # parse group contents - if group is not None: + if capture: try: group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None + else: + group = None sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and not (del_flags & SRE_FLAG_VERBOSE)) p = _parse_sub(source, state, sub_verbose, nested + 1) @@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False): source.tell() - start) if group is not None: state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index f8bbe5178219ac..bde7509f09729f 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -83,6 +83,23 @@ def test_search_star_plus(self): self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) self.assertIsNone(re.match('a+', 'xxx')) + def test_branching(self): + """Test Branching + Test expressions using the OR ('|') operator.""" + self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) + self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), + (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), + (0, 2)) + self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) + def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) @@ -1239,11 +1256,13 @@ def test_nothing_to_repeat(self): 'nothing to repeat', 3) def test_multiple_repeat(self): - for outer_reps in '*', '+', '{1,2}': - for outer_mod in '', '?': + for outer_reps in '*', '+', '?', '{1,2}': + for outer_mod in '', '?', '+': outer_op = outer_reps + outer_mod for inner_reps in '*', '+', '?', '{1,2}': - for inner_mod in '', '?': + for inner_mod in '', '?', '+': + if inner_mod + outer_reps in ('?', '+'): + continue inner_op = inner_reps + inner_mod self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 'multiple repeat', 1 + len(inner_op)) @@ -1458,7 +1477,8 @@ def test_inline_flags(self): def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + r"""Test that $ does not include \n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -1774,60 +1794,6 @@ def test_bug_2537(self): self.assertEqual(m.group(1), "") self.assertEqual(m.group(2), "y") - @cpython_only - def test_debug_flag(self): - pat = r'(\.)(?:[ch]|py)(?(1)$|: )' - with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.maxDiff = None - dump = '''\ -SUBPATTERN 1 0 0 - LITERAL 46 -BRANCH - IN - LITERAL 99 - LITERAL 104 -OR - LITERAL 112 - LITERAL 121 -GROUPREF_EXISTS 1 - AT AT_END -ELSE - LITERAL 58 - LITERAL 32 - - 0. INFO 8 0b1 2 5 (to 9) - prefix_skip 0 - prefix [0x2e] ('.') - overlap [0] - 9: MARK 0 -11. LITERAL 0x2e ('.') -13. MARK 1 -15. BRANCH 10 (to 26) -17. IN 6 (to 24) -19. LITERAL 0x63 ('c') -21. LITERAL 0x68 ('h') -23. FAILURE -24: JUMP 9 (to 34) -26: branch 7 (to 33) -27. LITERAL 0x70 ('p') -29. LITERAL 0x79 ('y') -31. JUMP 2 (to 34) -33: FAILURE -34: GROUPREF_EXISTS 0 6 (to 41) -37. AT END -39. JUMP 5 (to 45) -41: LITERAL 0x3a (':') -43. LITERAL 0x20 (' ') -45: SUCCESS -''' - self.assertEqual(out.getvalue(), dump) - # Debug output is output again even a second time (bypassing - # the cache -- issue #20426). - with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.assertEqual(out.getvalue(), dump) - def test_keyword_parameters(self): # Issue #20283: Accepting the string keyword parameter. pat = re.compile(r'(ab)') @@ -2072,6 +2038,218 @@ def test_bug_40736(self): with self.assertRaisesRegex(TypeError, "got 'type'"): re.search("x*", type) + def test_possessive_qualifiers(self): + """Test Possessive Qualifiers + Test qualifiers of the form @+ for some repetition operator @, + e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed + without creating a stack frame for rolling the stack back and + trying 1 or more fewer matches.""" + self.assertIsNone(re.match('e*+e', 'eeee')) + self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea') + self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea') + self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea') + self.assertIsNone(re.match('(.)++.', 'ee')) + self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',)) + self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(), + ('ae',)) + self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(), + ('',)) + self.assertEqual(re.match('()*+a', 'a').groups(), ('',)) + self.assertEqual(re.search('x*+', 'axx').span(), (0, 0)) + self.assertEqual(re.search('x++', 'axx').span(), (1, 3)) + self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0)) + self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3)) + self.assertIsNone(re.match('a++', 'xxx')) + self.assertIsNone(re.match(r"^(\w){1}+$", "abc")) + self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc")) + + self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c") + + self.assertIsNone(re.match("^x{1}+$", "xxx")) + self.assertIsNone(re.match("^x{1,2}+$", "xxx")) + + self.assertTrue(re.match("^x{3}+$", "xxx")) + self.assertTrue(re.match("^x{1,3}+$", "xxx")) + self.assertTrue(re.match("^x{1,4}+$", "xxx")) + + self.assertIsNone(re.match("^x{}+$", "xxx")) + self.assertTrue(re.match("^x{}+$", "x{}")) + + def test_fullmatch_possessive_qualifiers(self): + self.assertTrue(re.fullmatch(r'a++', 'a')) + self.assertTrue(re.fullmatch(r'a*+', 'a')) + self.assertTrue(re.fullmatch(r'a?+', 'a')) + self.assertTrue(re.fullmatch(r'a{1,3}+', 'a')) + self.assertIsNone(re.fullmatch(r'a++', 'ab')) + self.assertIsNone(re.fullmatch(r'a*+', 'ab')) + self.assertIsNone(re.fullmatch(r'a?+', 'ab')) + self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab')) + + self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab')) + self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc')) + + def test_findall_possessive_qualifiers(self): + self.assertEqual(re.findall(r'a++', 'aab'), ['aa']) + self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab']) + + def test_atomic_grouping(self): + """Test Atomic Grouping + Test non-capturing groups of the form (?>...), which does + not maintain any stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r'a(?>bc|b)c') + self.assertIsNone(pattern1.match('abc')) + self.assertTrue(pattern1.match('abcc')) + self.assertIsNone(re.match(r'(?>.*).', 'abc')) + self.assertTrue(re.match(r'(?>x)++', 'xxx')) + self.assertTrue(re.match(r'(?>x++)', 'xxx')) + self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) + self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) + + def test_fullmatch_atomic_grouping(self): + self.assertTrue(re.fullmatch(r'(?>a+)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a*)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a?)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a')) + self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab')) + + self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc')) + + def test_findall_atomic_grouping(self): + self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) + self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) + + +def get_debug_out(pat): + with captured_stdout() as out: + re.compile(pat, re.DEBUG) + return out.getvalue() + + +@cpython_only +class DebugTests(unittest.TestCase): + maxDiff = None + + def test_debug_flag(self): + pat = r'(\.)(?:[ch]|py)(?(1)$|: )' + dump = '''\ +SUBPATTERN 1 0 0 + LITERAL 46 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 + + 0. INFO 8 0b1 2 5 (to 9) + prefix_skip 0 + prefix [0x2e] ('.') + overlap [0] + 9: MARK 0 +11. LITERAL 0x2e ('.') +13. MARK 1 +15. BRANCH 10 (to 26) +17. IN 6 (to 24) +19. LITERAL 0x63 ('c') +21. LITERAL 0x68 ('h') +23. FAILURE +24: JUMP 9 (to 34) +26: branch 7 (to 33) +27. LITERAL 0x70 ('p') +29. LITERAL 0x79 ('y') +31. JUMP 2 (to 34) +33: FAILURE +34: GROUPREF_EXISTS 0 6 (to 41) +37. AT END +39. JUMP 5 (to 45) +41: LITERAL 0x3a (':') +43. LITERAL 0x20 (' ') +45: SUCCESS +''' + self.assertEqual(get_debug_out(pat), dump) + # Debug output is output again even a second time (bypassing + # the cache -- issue #20426). + self.assertEqual(get_debug_out(pat), dump) + + def test_atomic_group(self): + self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ +ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] + + 0. INFO 4 0b0 1 2 (to 5) + 5: ATOMIC_GROUP 11 (to 17) + 7. LITERAL 0x61 ('a') + 9. REPEAT_ONE 6 0 1 (to 16) +13. LITERAL 0x62 ('b') +15. SUCCESS +16: SUCCESS +17: SUCCESS +''') + + def test_possesive_repeat_one(self): + self.assertEqual(get_debug_out(r'a?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + + 0. INFO 4 0b0 0 1 (to 5) + 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) + 9. LITERAL 0x61 ('a') +11. SUCCESS +12: SUCCESS +''') + + def test_possesive_repeat(self): + self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + LITERAL 98 + + 0. INFO 4 0b0 0 2 (to 5) + 5: POSSESSIVE_REPEAT 7 0 1 (to 13) + 9. LITERAL 0x61 ('a') +11. LITERAL 0x62 ('b') +13: SUCCESS +14. SUCCESS +''') + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Misc/ACKS b/Misc/ACKS index 895813e8fc208a..cfc15be2c9748f 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -824,6 +824,7 @@ Ben Jackson Paul Jackson Manuel Jacob David Jacobs +Jeffrey C. Jacobs Kevin Jacobs Kjetil Jacobsen Shantanu Jain diff --git a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst new file mode 100644 index 00000000000000..1afed73ffeab23 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst @@ -0,0 +1,2 @@ +Add support of atomic grouping (``(?>...)``) and possessive qualifiers +(``*+``, ``++``, ``?+``, ``{m,n}+``) in :mod:`regular expressions `. diff --git a/Modules/_sre.c b/Modules/_sre.c index ab321eafb6d792..35bdb4f3d2202b 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_REPEAT_ONE: case SRE_OP_MIN_REPEAT_ONE: + case SRE_OP_POSSESSIVE_REPEAT_ONE: { SRE_CODE min, max; GET_SKIP; @@ -1826,8 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) break; case SRE_OP_REPEAT: + case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE min, max; + SRE_CODE op1 = op, min, max; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1839,7 +1841,25 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) FAIL; code += skip-3; GET_OP; - if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + if (op1 == SRE_OP_POSSESSIVE_REPEAT) { + if (op != SRE_OP_SUCCESS) + FAIL; + } + else { + if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + FAIL; + } + } + break; + + case SRE_OP_ATOMIC_GROUP: + { + GET_SKIP; + if (!_validate_inner(code, code+skip-2, groups)) + FAIL; + code += skip-2; + GET_OP; + if (op != SRE_OP_SUCCESS) FAIL; } break; diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index c8ccb32d21de6c..8b9125b75b4568 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20171005 +#define SRE_MAGIC 20220318 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -40,19 +40,22 @@ #define SRE_OP_REPEAT_ONE 25 #define SRE_OP_SUBPATTERN 26 #define SRE_OP_MIN_REPEAT_ONE 27 -#define SRE_OP_GROUPREF_IGNORE 28 -#define SRE_OP_IN_IGNORE 29 -#define SRE_OP_LITERAL_IGNORE 30 -#define SRE_OP_NOT_LITERAL_IGNORE 31 -#define SRE_OP_GROUPREF_LOC_IGNORE 32 -#define SRE_OP_IN_LOC_IGNORE 33 -#define SRE_OP_LITERAL_LOC_IGNORE 34 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 -#define SRE_OP_GROUPREF_UNI_IGNORE 36 -#define SRE_OP_IN_UNI_IGNORE 37 -#define SRE_OP_LITERAL_UNI_IGNORE 38 -#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 -#define SRE_OP_RANGE_UNI_IGNORE 40 +#define SRE_OP_ATOMIC_GROUP 28 +#define SRE_OP_POSSESSIVE_REPEAT 29 +#define SRE_OP_POSSESSIVE_REPEAT_ONE 30 +#define SRE_OP_GROUPREF_IGNORE 31 +#define SRE_OP_IN_IGNORE 32 +#define SRE_OP_LITERAL_IGNORE 33 +#define SRE_OP_NOT_LITERAL_IGNORE 34 +#define SRE_OP_GROUPREF_LOC_IGNORE 35 +#define SRE_OP_IN_LOC_IGNORE 36 +#define SRE_OP_LITERAL_LOC_IGNORE 37 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38 +#define SRE_OP_GROUPREF_UNI_IGNORE 39 +#define SRE_OP_IN_UNI_IGNORE 40 +#define SRE_OP_LITERAL_UNI_IGNORE 41 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42 +#define SRE_OP_RANGE_UNI_IGNORE 43 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 32469cd161cf3b..956fd3fad91649 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -480,6 +480,9 @@ do { \ #define JUMP_BRANCH 11 #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 +#define JUMP_POSS_REPEAT_1 14 +#define JUMP_POSS_REPEAT_2 15 +#define JUMP_ATOMIC_GROUP 16 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ DATA_ALLOC(SRE(match_context), nextctx); \ @@ -950,6 +953,60 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) } RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT_ONE: + /* match repeated sequence (maximizing regexp) without + backtracking */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item + tail */ + + TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + if (ctx->ptr + ctx->pattern[1] > end) { + RETURN_FAILURE; /* cannot match */ + } + + state->ptr = ctx->ptr; + + ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]); + RETURN_ON_ERROR(ret); + DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); + ctx->count = ret; + ctx->ptr += ctx->count; + + /* when we arrive here, count contains the number of + matches, and ctx->ptr points to the tail of the target + string. check if the rest of the pattern matches, + and fail if not. */ + + /* Test for not enough repetitions in match */ + if (ctx->count < (Py_ssize_t) ctx->pattern[1]) { + RETURN_FAILURE; + } + + /* Update the pattern to point to the next op code */ + ctx->pattern += ctx->pattern[0]; + + /* Let the tail be evaluated separately and consider this + match successful. */ + if (*ctx->pattern == SRE_OP_SUCCESS && + ctx->ptr == state->end && + !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) + { + /* tail is empty. we're finished */ + state->ptr = ctx->ptr; + RETURN_SUCCESS; + } + + /* Attempt to match the rest of the string */ + break; + case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ @@ -1110,6 +1167,138 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT: + /* create possessive repeat contexts. */ + /* <1=min> <2=max> pattern + tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Initialize Count to 0 */ + ctx->count = 0; + + /* Check for minimum required matches. */ + while (ctx->count < (Py_ssize_t)ctx->pattern[1]) { + /* not enough matches */ + DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, + &ctx->pattern[3]); + if (ret) { + RETURN_ON_ERROR(ret); + ctx->count++; + } + else { + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + } + + /* Clear the context's Input stream pointer so that it + doesn't match the global state so that the while loop can + be entered. */ + ctx->ptr = NULL; + + /* Keep trying to parse the sub-pattern until the + end is reached, creating a new context each time. */ + while ((ctx->count < (Py_ssize_t)ctx->pattern[2] || + (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) && + state->ptr != ctx->ptr) { + /* Save the Capture Group Marker state into the current + Context and back up the current highest number + Capture Group marker. */ + LASTMARK_SAVE(); + MARK_PUSH(ctx->lastmark); + + /* zero-width match protection */ + /* Set the context's Input Stream pointer to be the + current Input Stream pointer from the global + state. When the loop reaches the next iteration, + the context will then store the last known good + position with the global state holding the Input + Input Stream position that has been updated with + the most recent match. Thus, if state's Input + stream remains the same as the one stored in the + current Context, we know we have successfully + matched an empty string and that all subsequent + matches will also be the empty string until the + maximum number of matches are counted, and because + of this, we could immediately stop at that point and + consider this match successful. */ + ctx->ptr = state->ptr; + + /* We have not reached the maximin matches, so try to + match once more. */ + DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, + &ctx->pattern[3]); + + /* Check to see if the last attempted match + succeeded. */ + if (ret) { + /* Drop the saved highest number Capture Group + marker saved above and use the newly updated + value. */ + MARK_POP_DISCARD(ctx->lastmark); + RETURN_ON_ERROR(ret); + + /* Success, increment the count. */ + ctx->count++; + } + /* Last attempted match failed. */ + else { + /* Restore the previously saved highest number + Capture Group marker since the last iteration + did not match, then restore that to the global + state. */ + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); + + /* We have sufficient matches, so exit loop. */ + break; + } + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0] + 1; + ctx->ptr = state->ptr; + break; + + case SRE_OP_ATOMIC_GROUP: + /* Atomic Group Sub Pattern */ + /* pattern tail */ + TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Evaluate the Atomic Group in a new context, terminating + when the end of the group, represented by a SUCCESS op + code, is reached. */ + /* Group Pattern begins at an offset of 1 code. */ + DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group, + &ctx->pattern[1]); + + /* Test Exit Condition */ + RETURN_ON_ERROR(ret); + + if (ret == 0) { + /* Atomic Group failed to Match. */ + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0]; + ctx->ptr = state->ptr; + break; + case SRE_OP_GROUPREF: /* match backreference */ TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, @@ -1306,6 +1495,12 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case JUMP_MIN_UNTIL_1: TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); goto jump_min_until_1; + case JUMP_POSS_REPEAT_1: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_1; + case JUMP_POSS_REPEAT_2: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_2; case JUMP_REPEAT: TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); goto jump_repeat; @@ -1318,6 +1513,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case JUMP_MIN_REPEAT_ONE: TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); goto jump_min_repeat_one; + case JUMP_ATOMIC_GROUP: + TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + goto jump_atomic_group; case JUMP_ASSERT: TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); goto jump_assert;