Skip to content

Commit

Permalink
pythongh-109747: Improve errors for unsupported look-behind patterns
Browse files Browse the repository at this point in the history
Now re.error is raised instead of OverflowError or RuntimeError for
too large width of look-behind pattern.

The limit is increased to 2**32-1 (was 2**31-1).
  • Loading branch information
serhiy-storchaka committed Sep 25, 2023
1 parent e81bd3f commit 940b85f
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 13 deletions.
4 changes: 3 additions & 1 deletion Lib/re/_compiler.py
Expand Up @@ -147,6 +147,8 @@ def _compile(code, pattern, flags):
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo > MAXCODE:
raise error("looks too much behind")
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
Expand Down Expand Up @@ -547,7 +549,7 @@ def _compile_info(code, pattern, flags):
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
emit(hi)
# add literal prefix
if prefix:
emit(len(prefix)) # length
Expand Down
13 changes: 10 additions & 3 deletions Lib/re/_parser.py
Expand Up @@ -67,6 +67,10 @@
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
GLOBAL_FLAGS = SRE_FLAG_DEBUG

# Maximal value returned by SubPattern.getwidth().
# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
MAXWIDTH = 1 << 64

class State:
# keeps track of state for parsing
def __init__(self):
Expand Down Expand Up @@ -177,7 +181,7 @@ def getwidth(self):
lo = hi = 0
for op, av in self.data:
if op is BRANCH:
i = MAXREPEAT - 1
i = MAXWIDTH
j = 0
for av in av[1]:
l, h = av.getwidth()
Expand All @@ -196,7 +200,10 @@ def getwidth(self):
elif op in _REPEATCODES:
i, j = av[2].getwidth()
lo = lo + i * av[0]
hi = hi + j * av[1]
if av[1] == MAXREPEAT and j:
hi = MAXWIDTH
else:
hi = hi + j * av[1]
elif op in _UNITCODES:
lo = lo + 1
hi = hi + 1
Expand All @@ -216,7 +223,7 @@ def getwidth(self):
hi = hi + j
elif op is SUCCESS:
break
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
return self.width

class Tokenizer:
Expand Down
23 changes: 23 additions & 0 deletions Lib/test/test_re.py
Expand Up @@ -1861,6 +1861,29 @@ def test_repeat_minmax_overflow(self):
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))

def test_look_behind_overflow(self):
string = "x" * 2_500_000
p1 = r"(?<=((.{%d}){%d}){%d})"
p2 = r"(?<!((.{%d}){%d}){%d})"
# Test that the templates are valid and look-behind with width 2**21
# (larger than sys.maxunicode) are supported.
self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
(2**21, 2**21))
self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
(0, 0))
# Test that 2**22 is accepted as a repetition number and look-behind
# width.
re.compile(p1 % (2**22, 1, 1))
re.compile(p1 % (1, 2**22, 1))
re.compile(p1 % (1, 1, 2**22))
re.compile(p2 % (2**22, 1, 1))
re.compile(p2 % (1, 2**22, 1))
re.compile(p2 % (1, 1, 2**22))
# But 2**66 is too large for look-behind width.
errmsg = "looks too much behind"
self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))

def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
self.checkPatternError('(?P=<foo>)',
Expand Down
@@ -0,0 +1,3 @@
Improve errors for unsupported look-behind patterns. Now re.error is raised
instead of OverflowError or RuntimeError for too large width of look-behind
pattern.
2 changes: 0 additions & 2 deletions Modules/_sre/sre.c
Expand Up @@ -2067,8 +2067,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
GET_SKIP;
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
if (arg & 0x80000000)
FAIL; /* Width too large */
/* Stop 1 before the end; we check the SUCCESS below */
if (_validate_inner(code+1, code+skip-2, groups))
FAIL;
Expand Down
14 changes: 7 additions & 7 deletions Modules/_sre/sre_lib.h
Expand Up @@ -591,8 +591,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
TRACE(("reject (got %zd chars, need %zd)\n",
end - ptr, (Py_ssize_t) pattern[3]));
TRACE(("reject (got %tu chars, need %zu)\n",
end - ptr, (size_t) pattern[3]));
RETURN_FAILURE;
}
pattern += pattern[1] + 1;
Expand Down Expand Up @@ -1509,7 +1509,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", pattern,
ptr, pattern[1]));
if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
RETURN_FAILURE;
state->ptr = ptr - pattern[1];
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
Expand All @@ -1522,7 +1522,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
ptr, pattern[1]));
if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
state->ptr = ptr - pattern[1];
LASTMARK_SAVE();
if (state->repeat)
Expand Down Expand Up @@ -1656,9 +1656,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)

flags = pattern[2];

if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
TRACE(("reject (got %u chars, need %u)\n",
(unsigned int)(end - ptr), pattern[3]));
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
TRACE(("reject (got %tu chars, need %zu)\n",
end - ptr, (size_t) pattern[3]));
return 0;
}
if (pattern[3] > 1) {
Expand Down

0 comments on commit 940b85f

Please sign in to comment.