Skip to content

Commit

Permalink
gh-106628: email parsing speedup (gh-106629)
Browse files Browse the repository at this point in the history
  • Loading branch information
cfbolz committed Jul 13, 2023
1 parent af51bd7 commit 7e6ce48
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
15 changes: 9 additions & 6 deletions Lib/email/feedparser.py
Expand Up @@ -37,6 +37,8 @@
# Matches the start of a header-like line: a line beginning with 'From ', a
# run of printable ASCII characters other than ':' followed by ':', or a line
# beginning with a tab or space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
# Boundary-independent tail of a multipart boundary line: an optional closing
# '--' (group 'end'), optional trailing blanks (group 'ws'), an optional line
# ending (group 'linesep'), then end of string.  Because it does not embed the
# boundary text itself, one compiled pattern can be shared by every boundary.
boundaryendRE = re.compile(
r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')

# Unique sentinel object; callers test for it by identity (`is NeedMoreData`).
NeedMoreData = object()

Expand Down Expand Up @@ -327,9 +329,10 @@ def _parsegen(self):
# this onto the input stream until we've scanned past the
# preamble.
separator = '--' + boundary
boundaryre = re.compile(
'(?P<sep>' + re.escape(separator) +
r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
# Match a multipart boundary line without compiling a boundary-specific regex:
# first a plain prefix test against `separator` ('--' + boundary), then the
# shared boundaryendRE applied from the offset just past the separator.
# Returns None when the line is not a boundary line, otherwise the match
# object (named groups: 'end', 'ws', 'linesep').
def boundarymatch(line):
if not line.startswith(separator):
return None
return boundaryendRE.match(line, len(separator))
capturing_preamble = True
preamble = []
linesep = False
Expand All @@ -341,7 +344,7 @@ def _parsegen(self):
continue
if line == '':
break
mo = boundaryre.match(line)
mo = boundarymatch(line)
if mo:
# If we're looking at the end boundary, we're done with
# this multipart. If there was a newline at the end of
Expand Down Expand Up @@ -373,13 +376,13 @@ def _parsegen(self):
if line is NeedMoreData:
yield NeedMoreData
continue
mo = boundaryre.match(line)
mo = boundarymatch(line)
if not mo:
self._input.unreadline(line)
break
# Recurse to parse this subpart; the input stream points
# at the subpart's first line.
self._input.push_eof_matcher(boundaryre.match)
self._input.push_eof_matcher(boundarymatch)
for retval in self._parsegen():
if retval is NeedMoreData:
yield NeedMoreData
Expand Down
@@ -0,0 +1,2 @@
Speed up parsing of emails by about 20% by not compiling a new regular
expression for every single email.

0 comments on commit 7e6ce48

Please sign in to comment.