Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ def reset(self):
self.cdata_elem = None
self._support_cdata = True
self._escapable = True
self._pending = []
self._pending_len = 0
self._parse_threshold = 1
super().reset()

def feed(self, data):
Expand All @@ -165,11 +168,36 @@ def feed(self, data):
Call this as often as you want, with as little or as much text
as you want (may include '\n').
"""
self.rawdata = self.rawdata + data
self.goahead(0)
# Accumulate new data in a list and only join and parse it once
# enough has piled up. Rescanning an unparsed buffer (e.g. an
# unterminated tag) and concatenating onto it on every call would
# both be quadratic in the input size.
self._pending_len += len(data)
if self._pending_len < self._parse_threshold:
self._pending.append(data)
else:
if not self._pending:
self.rawdata += data
else:
self._pending.append(data)
self.rawdata += ''.join(self._pending)
self._pending.clear()
self._pending_len = 0
n = len(self.rawdata)
self.goahead(0)
if len(self.rawdata) < n:
# Some data was parsed; resume on the next call.
self._parse_threshold = 1
else:
# Nothing was parsed; wait until the buffer doubles.
self._parse_threshold = len(self.rawdata)

def close(self):
    """Handle any buffered data.

    Appends every chunk still queued in ``self._pending`` to
    ``self.rawdata``, resets the deferred-parsing bookkeeping used by
    ``feed()``, and then calls ``self.goahead(1)``.
    """
    if self._pending:
        # feed() may have queued chunks without joining them yet; join
        # them once so the final pass sees all of the input.
        self.rawdata += ''.join(self._pending)
        self._pending.clear()
    self._pending_len = 0
    # feed() raises the threshold after a pass that consumed nothing.
    # Restore the initial value here; otherwise data fed after close()
    # would sit in the queue, unparsed, until it outgrew the stale value.
    self._parse_threshold = 1
    self.goahead(1)

__starttag_text = None
Expand Down
20 changes: 20 additions & 0 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,26 @@ def check(source):
check("<![CDATA[" * 9 * n)
check("<!doctype" * 35 * n)

@support.requires_resource('cpu')
def test_incremental_no_quadratic_complexity(self):
    # Each case opens a construct, feeds 200,000 small chunks without
    # terminating it, then feeds the terminator and closes the parser.
    # Rescanning and re-concatenating the buffer on every feed() used
    # to make this quadratic; it is now expected to finish in a
    # fraction of a second.
    filler = "a" * 64
    cases = (
        ("<!--", "-->"),             # comment
        ("<?", ">"),                 # processing instruction
        ("<!doctype ", ">"),         # doctype
        ("<![CDATA[", "]]>"),        # CDATA section
        ("<a href='", "'>"),         # start tag
        ("<script>", "</script>"),   # RAWTEXT element
    )
    for opener, closer in cases:
        parser = html.parser.HTMLParser()
        parser.feed(opener)
        for _ in range(200_000):
            parser.feed(filler)
        parser.feed(closer)
        parser.close()


class AttributesTestCase(TestCaseBase):

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fixed quadratic complexity in incremental parsing of long unterminated
constructs (such as tags or comments) in :class:`html.parser.HTMLParser`,
which could be exploited for a denial of service.
Loading