gh-106628: email parsing speedup (gh-106629)

python · Jul 13, 2023 · 7e6ce48 · 7e6ce48
1 parent af51bd7
commit 7e6ce48
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 6 deletions.
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
@@ -37,6 +37,8 @@
 headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
 EMPTYSTRING = ''
 NL = '\n'
+boundaryendRE = re.compile(
+    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
 
 NeedMoreData = object()
 
@@ -327,9 +329,10 @@ def _parsegen(self):
             # this onto the input stream until we've scanned past the
             # preamble.
             separator = '--' + boundary
-            boundaryre = re.compile(
-                '(?P<sep>' + re.escape(separator) +
-                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+            def boundarymatch(line):
+                if not line.startswith(separator):
+                    return None
+                return boundaryendRE.match(line, len(separator))
             capturing_preamble = True
             preamble = []
             linesep = False
@@ -341,7 +344,7 @@ def _parsegen(self):
                     continue
                 if line == '':
                     break
-                mo = boundaryre.match(line)
+                mo = boundarymatch(line)
                 if mo:
                     # If we're looking at the end boundary, we're done with
                     # this multipart.  If there was a newline at the end of
@@ -373,13 +376,13 @@ def _parsegen(self):
                         if line is NeedMoreData:
                             yield NeedMoreData
                             continue
-                        mo = boundaryre.match(line)
+                        mo = boundarymatch(line)
                         if not mo:
                             self._input.unreadline(line)
                             break
                     # Recurse to parse this subpart; the input stream points
                     # at the subpart's first line.
-                    self._input.push_eof_matcher(boundaryre.match)
+                    self._input.push_eof_matcher(boundarymatch)
                     for retval in self._parsegen():
                         if retval is NeedMoreData:
                             yield NeedMoreData

diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.