python · serhiy-storchaka · Mar 21, 2022 · Mar 18, 2022 · Mar 19, 2022 · Mar 19, 2022
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
@@ -154,6 +154,30 @@ The special characters are:
    characters as possible will be matched.  Using the RE ``<.*?>`` will match
    only ``'<a>'``.
 
+.. index::
+   single: *+; in regular expressions
+   single: ++; in regular expressions
+   single: ?+; in regular expressions
+
+``*+``, ``++``, ``?+``
+  Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is
+  appended also match as many times as possible.
+  However, unlike the true greedy qualifiers, these do not allow
+  back-tracking when the expression following them fails to match.
+  These are known as :dfn:`possessive` qualifiers.
+  For example, ``a*a`` will match ``'aaaa'`` because the ``a*`` will match
+  all 4 ``'a'``\ s, but, when the final ``'a'`` is encountered, the
+  expression is backtracked so that in the end the ``a*`` ends up matching
+  3 ``'a'``\ s total, and the fourth ``'a'`` is matched by the final ``'a'``.
+  However, when ``a*+a`` is used to match ``'aaaa'``, the ``a*+`` will
+  match all 4 ``'a'``\ s, but when the final ``'a'`` fails to find any more
+  characters to match, the expression cannot be backtracked and will thus
+  fail to match.
+  ``x*+``, ``x++`` and ``x?+`` are equivalent to ``(?>x*)``, ``(?>x+)``
+  and ``(?>x?)`` correspondingly.
+
+  .. versionadded:: 3.11
+
 .. index::
    single: {} (curly brackets); in regular expressions
 
@@ -178,6 +202,21 @@ The special characters are:
    6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters,
    while ``a{3,5}?`` will only match 3 characters.
 
+``{m,n}+``
+   Causes the resulting RE to match from *m* to *n* repetitions of the
+   preceding RE, attempting to match as many repetitions as possible
+   *without* establishing any backtracking points.
+   This is the possessive version of the qualifier above.
+   For example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa``
+   will attempt to match 5 ``'a'`` characters, then, requiring 2 more
+   ``'a'``\ s, will need more characters than available and thus fail, while
+   ``a{3,5}aa`` will match with ``a{3,5}`` capturing 5, then 4 ``'a'``\ s
+   by backtracking and then the final 2 ``'a'``\ s are matched by the final
+   ``aa`` in the pattern.
+   ``x{m,n}+`` is equivalent to ``(?>x{m,n})``.
+
+   .. versionadded:: 3.11
+
 .. index:: single: \ (backslash); in regular expressions
 
 ``\``
@@ -333,6 +372,21 @@ The special characters are:
    .. versionchanged:: 3.7
       The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group.
 
+``(?>...)``
+   Attempts to match ``...`` as if it was a separate regular expression, and
+   if successful, continues to match the rest of the pattern following it.
+   If the subsequent pattern fails to match, the stack can only be unwound
+   to a point *before* the ``(?>...)`` because once exited, the expression,
+   known as an :dfn:`atomic group`, has thrown away all stack points within
+   itself.
+   Thus, ``(?>.*).`` would never match anything because first the ``.*``
+   would match all characters possible, then, having nothing left to match,
+   the final ``.`` would fail to match.
+   Since there are no stack points saved in the atomic group, and there is
+   no stack point before it, the entire expression would thus fail to match.
+
+   .. versionadded:: 3.11
+
 .. index:: single: (?P<; in regular expressions
 
 ``(?P<name>...)``

diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
@@ -295,6 +295,13 @@ os
   instead of ``CryptGenRandom()`` which is deprecated.
   (Contributed by Dong-hee Na in :issue:`44611`.)
 
+re
+--
+
+* :term:`Atomic grouping <atomic group>` (``(?>...)``) and :term:`possessive`
+  qualifiers (``*+``, ``++``, ``?+``, ``{m,n}+``) are now supported in regular
+  expressions.
+  (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.)
 
 shutil
 ------

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
@@ -17,7 +17,7 @@
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 
 _LITERAL_CODES = {LITERAL, NOT_LITERAL}
-_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
+_REPEATING_CODES = {MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
 _UNIT_CODES = _LITERAL_CODES | {ANY, IN}
@@ -140,6 +140,8 @@ def _compile(code, pattern, flags):
             if _simple(av[2]):
                 if op is MAX_REPEAT:
                     emit(REPEAT_ONE)
+                elif op is POSSESSIVE_REPEAT:
+                    emit(POSSESSIVE_REPEAT_ONE)
                 else:
                     emit(MIN_REPEAT_ONE)
                 skip = _len(code); emit(0)
@@ -149,13 +151,18 @@ def _compile(code, pattern, flags):
                 emit(SUCCESS)
                 code[skip] = _len(code) - skip
             else:
-                emit(REPEAT)
+                if op is POSSESSIVE_REPEAT:
+                    emit(POSSESSIVE_REPEAT)
+                else:
+                    emit(REPEAT)
                 skip = _len(code); emit(0)
                 emit(av[0])
                 emit(av[1])
                 _compile(code, av[2], flags)
                 code[skip] = _len(code) - skip
-                if op is MAX_REPEAT:
+                if op is POSSESSIVE_REPEAT:
+                    emit(SUCCESS)
+                elif op is MAX_REPEAT:
                     emit(MAX_UNTIL)
                 else:
                     emit(MIN_UNTIL)
@@ -169,6 +176,17 @@ def _compile(code, pattern, flags):
             if group:
                 emit(MARK)
                 emit((group-1)*2+1)
+        elif op is ATOMIC_GROUP:
+            # Atomic Groups are handled by starting with an Atomic
+            # Group op code, then putting in the atomic group pattern
+            # and finally a success op code to tell any repeat
+            # operations within the Atomic Group to stop eating and
+            # pop their stack if they reach it
+            emit(ATOMIC_GROUP)
+            skip = _len(code); emit(0)
+            _compile(code, av, flags)
+            emit(SUCCESS)
+            code[skip] = _len(code) - skip
         elif op in SUCCESS_CODES:
             emit(op)
         elif op in ASSERT_CODES:
@@ -709,7 +727,8 @@ def print_2(*args):
                     else:
                         print_(FAILURE)
                 i += 1
-            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
+                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                 skip, min, max = code[i: i+3]
                 if max == MAXREPEAT:
                     max = 'MAXREPEAT'
@@ -725,6 +744,11 @@ def print_2(*args):
                 print_(op, skip, arg, to=i+skip)
                 dis_(i+2, i+skip)
                 i += skip
+            elif op is ATOMIC_GROUP:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                dis_(i+1, i+skip)
+                i += skip
             elif op is INFO:
                 skip, flags, min, max = code[i: i+4]
                 if max == MAXREPEAT:

diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20171005
+MAGIC = 20220318
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -97,6 +97,9 @@ def _makecodes(names):
     REPEAT_ONE
     SUBPATTERN
     MIN_REPEAT_ONE
+    ATOMIC_GROUP
+    POSSESSIVE_REPEAT
+    POSSESSIVE_REPEAT_ONE
 
     GROUPREF_IGNORE
     IN_IGNORE
@@ -144,7 +147,6 @@ def _makecodes(names):
     CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
 """)
 
-
 # replacement operations for "ignore case" mode
 OP_IGNORE = {
     LITERAL: LITERAL_IGNORE,

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
@@ -25,7 +25,7 @@
 
 WHITESPACE = frozenset(" \t\n\r\v\f")
 
-_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
+_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
 
 ESCAPES = {
@@ -190,6 +190,10 @@ def getwidth(self):
                 i, j = av.getwidth()
                 lo = lo + i
                 hi = hi + j
+            elif op is ATOMIC_GROUP:
+                i, j = av.getwidth()
+                lo = lo + i
+                hi = hi + j
             elif op is SUBPATTERN:
                 i, j = av[-1].getwidth()
                 lo = lo + i
@@ -675,16 +679,22 @@ def _parse(source, state, verbose, nested, first=False):
                 if group is None and not add_flags and not del_flags:
                     item = p
             if sourcematch("?"):
+                # Non-Greedy Match
                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
+            elif sourcematch("+"):
+                # Possessive Match (Always Greedy)
+                subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
             else:
+                # Greedy Match
                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
 
         elif this == ".":
             subpatternappend((ANY, None))
 
         elif this == "(":
             start = source.tell() - 1
-            group = True
+            capture = True
+            atomic = False
             name = None
             add_flags = 0
             del_flags = 0
@@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
                                            len(char) + 2)
                 elif char == ":":
                     # non-capturing group
-                    group = None
+                    capture = False
                 elif char == "#":
                     # comment
                     while True:
@@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
                     subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
                     continue
 
+                elif char == ">":
+                    # non-capturing, atomic group
+                    capture = False
+                    atomic = True
                 elif char in FLAGS or char == "-":
                     # flags
                     flags = _parse_flags(source, state, char)
@@ -818,17 +832,19 @@ def _parse(source, state, verbose, nested, first=False):
                         continue
 
                     add_flags, del_flags = flags
-                    group = None
+                    capture = False
                 else:
                     raise source.error("unknown extension ?" + char,
                                        len(char) + 1)
 
             # parse group contents
-            if group is not None:
+            if capture:
                 try:
                     group = state.opengroup(name)
                 except error as err:
                     raise source.error(err.msg, len(name) + 1) from None
+            else:
+                group = None
             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                            not (del_flags & SRE_FLAG_VERBOSE))
             p = _parse_sub(source, state, sub_verbose, nested + 1)
@@ -837,7 +853,11 @@ def _parse(source, state, verbose, nested, first=False):
                                    source.tell() - start)
             if group is not None:
                 state.closegroup(group, p)
-            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
+            if atomic:
+                assert group is None
+                subpatternappend((ATOMIC_GROUP, p))
+            else:
+                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
 
         elif this == "^":
             subpatternappend((AT, AT_BEGINNING))