diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 38ddf9ef442d36..fbe0d3665e073c 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -157,6 +157,9 @@ def reset(self): self.cdata_elem = None self._support_cdata = True self._escapable = True + self._pending = [] + self._pending_len = 0 + self._parse_threshold = 1 super().reset() def feed(self, data): @@ -165,11 +168,39 @@ def feed(self, data): Call this as often as you want, with as little or as much text as you want (may include '\n'). """ - self.rawdata = self.rawdata + data - self.goahead(0) + # Accumulate new data in a list and only join and parse it once + # enough has piled up. Rescanning an unparsed buffer (e.g. an + # unterminated tag) and concatenating onto it on every call would + # both be quadratic in the input size. + self._pending_len += len(data) + if self._pending_len < self._parse_threshold: + self._pending.append(data) + else: + if not self._pending: + self.rawdata += data + else: + self._pending.append(data) + self.rawdata += ''.join(self._pending) + self._pending.clear() + self._pending_len = 0 + n = len(self.rawdata) + self.goahead(0) + if len(self.rawdata) < n: + # Some data was parsed; resume on the next call. + self._parse_threshold = 1 + else: + # Nothing was parsed; wait until the buffer doubles. 
+ self._parse_threshold = len(self.rawdata) def close(self): """Handle any buffered data.""" + # A threshold left over from an earlier unparsed buffer must not + # delay parsing of data fed after close(). + self._parse_threshold = 1 + if self._pending: + self.rawdata += ''.join(self._pending) + self._pending.clear() + self._pending_len = 0 self.goahead(1) __starttag_text = None diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 6b7624f11505d9..3fdaed4ff46b9d 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -1041,6 +1041,26 @@ def check(source): check("") # comment + check("") # processing instruction + check("") # doctype + check("") # CDATA section + check("") # start tag + check("") # RAWTEXT element + class AttributesTestCase(TestCaseBase): diff --git a/Misc/NEWS.d/next/Security/2026-07-04-17-00-00.gh-issue-153030.RovkP6.rst b/Misc/NEWS.d/next/Security/2026-07-04-17-00-00.gh-issue-153030.RovkP6.rst new file mode 100644 index 00000000000000..d1d60593f4ba7d --- /dev/null +++ b/Misc/NEWS.d/next/Security/2026-07-04-17-00-00.gh-issue-153030.RovkP6.rst @@ -0,0 +1,3 @@ +Fixed quadratic complexity in incremental parsing of long unterminated +constructs (such as tags or comments) in :class:`html.parser.HTMLParser`, +which could be exploited for a denial of service.