diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index d0a4c55caf6e41..f87712d6d6f9f8 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -147,6 +147,8 @@ def _compile(code, pattern, flags): emit(0) # look ahead else: lo, hi = av[1].getwidth() + if lo > MAXCODE: + raise error("looks too much behind") if lo != hi: raise error("look-behind requires fixed-width pattern") emit(lo) # look behind @@ -547,7 +549,7 @@ def _compile_info(code, pattern, flags): else: emit(MAXCODE) prefix = prefix[:MAXCODE] - emit(min(hi, MAXCODE)) + emit(hi) # add literal prefix if prefix: emit(len(prefix)) # length diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index d00b7e67d55958..f3c779340fe230 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -67,6 +67,10 @@ TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE GLOBAL_FLAGS = SRE_FLAG_DEBUG +# Maximal value returned by SubPattern.getwidth(). +# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. +MAXWIDTH = 1 << 64 + class State: # keeps track of state for parsing def __init__(self): @@ -177,7 +181,7 @@ def getwidth(self): lo = hi = 0 for op, av in self.data: if op is BRANCH: - i = MAXREPEAT - 1 + i = MAXWIDTH j = 0 for av in av[1]: l, h = av.getwidth() @@ -196,7 +200,10 @@ def getwidth(self): elif op in _REPEATCODES: i, j = av[2].getwidth() lo = lo + i * av[0] - hi = hi + j * av[1] + if av[1] == MAXREPEAT and j: + hi = MAXWIDTH + else: + hi = hi + j * av[1] elif op in _UNITCODES: lo = lo + 1 hi = hi + 1 @@ -216,7 +223,7 @@ def getwidth(self): hi = hi + j elif op is SUCCESS: break - self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) + self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) return self.width class Tokenizer: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 45bce1925f9e89..e30c7762de7a1d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1861,6 +1861,29 @@ def test_repeat_minmax_overflow(self): self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% 2**128) self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) + def test_look_behind_overflow(self): + string = "x" * 2_500_000 + p1 = r"(?<=((.{%d}){%d}){%d})" + p2 = r"(?)', diff --git a/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst new file mode 100644 index 00000000000000..b64ba627897a1a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst @@ -0,0 +1,3 @@ +Improve errors for unsupported look-behind patterns. Now re.error is raised +instead of OverflowError or RuntimeError for too large width of look-behind +pattern. diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 07da5da13f70d3..cf2156388105e3 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -2067,8 +2067,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; GET_ARG; /* 0 for lookahead, width for lookbehind */ code--; /* Back up over arg to simplify math below */ - if (arg & 0x80000000) - FAIL; /* Width too large */ /* Stop 1 before the end; we check the SUCCESS below */ if (_validate_inner(code+1, code+skip-2, groups)) FAIL; diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index ae80009fd63bbe..bacfe81e49f7a8 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -591,8 +591,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* optimization info block */ /* <1=skip> <2=flags> <3=min> ... 
*/ if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) { - TRACE(("reject (got %zd chars, need %zd)\n", - end - ptr, (Py_ssize_t) pattern[3])); + TRACE(("reject (got %tu chars, need %zu)\n", + end - ptr, (size_t) pattern[3])); RETURN_FAILURE; } pattern += pattern[1] + 1; @@ -1509,7 +1509,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* <ASSERT> <skip> <back> <pattern> */ TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1])); - if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1]) + if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1]) RETURN_FAILURE; state->ptr = ptr - pattern[1]; DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2); @@ -1522,7 +1522,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* <ASSERT_NOT> <skip> <back> <pattern> */ TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1])); - if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) { + if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) { state->ptr = ptr - pattern[1]; LASTMARK_SAVE(); if (state->repeat) @@ -1656,9 +1656,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) flags = pattern[2]; - if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) { - TRACE(("reject (got %u chars, need %u)\n", - (unsigned int)(end - ptr), pattern[3])); + if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) { + TRACE(("reject (got %tu chars, need %zu)\n", + end - ptr, (size_t) pattern[3])); return 0; } if (pattern[3] > 1) {