From c32a33286bac6717eae0fe623c870a0bf8668862 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 31 Oct 2025 17:44:02 +0200 Subject: [PATCH] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser (GH-137837) * the "plaintext" element * the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes" * optionally RAWTEXT (if scripting=True) element "noscript" (cherry picked from commit a17c57eee5b5cc81390750d07e4800b19c0c3084) Co-authored-by: Serhiy Storchaka --- Doc/library/html.parser.rst | 33 +-- Lib/html/parser.py | 24 +- Lib/test/test_htmlparser.py | 217 ++++++++++-------- ...-08-15-23-08-44.gh-issue-137836.b55rhh.rst | 3 + 4 files changed, 163 insertions(+), 114 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..341a8337ba2ceb 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -15,14 +15,18 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(*, convert_charrefs=True) +.. class:: HTMLParser(*, convert_charrefs=True, scripting=False) Create a parser instance able to parse invalid markup. - If *convert_charrefs* is ``True`` (the default), all character - references (except the ones in ``script``/``style`` elements) are + If *convert_charrefs* is true (the default), all character + references (except the ones in elements like ``script`` and ``style``) are automatically converted to the corresponding Unicode characters. + If *scripting* is false (the default), the content of the ``noscript`` + element is parsed normally; if it's true, it's returned as is without + being parsed. + An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. The user should subclass :class:`.HTMLParser` and override its @@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. versionchanged:: 3.5 The default value for argument *convert_charrefs* is now ``True``. + .. versionchanged:: 3.14.1 + Added the *scripting* parameter. + Example HTML Parser Application ------------------------------- @@ -161,15 +168,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): .. method:: HTMLParser.handle_data(data) This method is called to process arbitrary data (e.g. text nodes and the - content of ```` and ````). + content of elements like ``script`` and ``style``). .. method:: HTMLParser.handle_entityref(name) This method is called to process a named character reference of the form ``&name;`` (e.g. ``>``), where *name* is a general entity reference - (e.g. ``'gt'``). This method is never called if *convert_charrefs* is - ``True``. + (e.g. ``'gt'``). + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_charref(name) @@ -177,8 +184,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): This method is called to process decimal and hexadecimal numeric character references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`. For example, the decimal equivalent for ``>`` is ``>``, whereas the hexadecimal is ``>``; - in this case the method will receive ``'62'`` or ``'x3E'``. This method - is never called if *convert_charrefs* is ``True``. + in this case the method will receive ``'62'`` or ``'x3E'``. + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_comment(data) @@ -292,8 +299,8 @@ Parsing an element with a few attributes and a title: Data : Python End tag : h1 -The content of ``script`` and ``style`` elements is returned as is, without -further parsing: +The content of elements like ``script`` and ``style`` is returned as is, +without further parsing: .. doctest:: @@ -304,10 +311,10 @@ further parsing: End tag : style >>> parser.feed('') + ... 'alert("hello! ☺");') Start tag: script attr: ('type', 'text/javascript') - Data : alert("hello!"); + Data : alert("hello! ☺"); End tag : script Parsing comments: @@ -336,7 +343,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``): Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but :meth:`~HTMLParser.handle_data` might be called more than once -(unless *convert_charrefs* is set to ``True``): +if *convert_charrefs* is false: .. doctest:: diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 5d7050dad2396b..e50620de800d63 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -127,17 +127,25 @@ class HTMLParser(_markupbase.ParserBase): argument. """ - CDATA_CONTENT_ELEMENTS = ("script", "style") + # See the HTML5 specs section "13.4 Parsing HTML fragments". + # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments + # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode + CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") - def __init__(self, *, convert_charrefs=True): + def __init__(self, *, convert_charrefs=True, scripting=False): """Initialize and reset this instance. - If convert_charrefs is True (the default), all character references + If convert_charrefs is true (the default), all character references are automatically converted to the corresponding Unicode characters. + + If *scripting* is false (the default), the content of the + ``noscript`` element is parsed normally; if it's true, + it's returned as is without being parsed. """ super().__init__() self.convert_charrefs = convert_charrefs + self.scripting = scripting self.reset() def reset(self): @@ -172,7 +180,9 @@ def get_starttag_text(self): def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() self._escapable = escapable - if escapable and not self.convert_charrefs: + if self.cdata_elem == 'plaintext': + self.interesting = re.compile(r'\z') + elif escapable and not self.convert_charrefs: self.interesting = re.compile(r'&|])' % self.cdata_elem, re.IGNORECASE|re.ASCII) else: @@ -444,8 +454,10 @@ def parse_starttag(self, i): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) + if (tag in self.CDATA_CONTENT_ELEMENTS or + (self.scripting and tag == "noscript") or + tag == "plaintext"): + self.set_cdata_mode(tag, escapable=False) elif tag in self.RCDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag, escapable=True) return endpos diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 6a1d69335a0616..19dde9362a43b6 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -8,6 +8,18 @@ from test import support +SAMPLE_RCDATA = ( + '' + "" + '' + '' + '' + '\u2603' +) + +SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&☺' + + class EventCollector(html.parser.HTMLParser): def __init__(self, *args, autocdata=False, **kw): @@ -293,30 +305,20 @@ def test_get_starttag_text(self): 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'), '\n\n', '', - 'foo = ""', - 'foo = ""', - 'foo = ""', - 'foo = ""', - 'foo = ""', - 'foo = ""', ]) def test_script_content(self, content): s = f'' - self._run_check(s, [("starttag", "script", []), - ("data", content), - ("endtag", "script")]) + self._run_check(s, [ + ("starttag", "script", []), + ("data", content), + ("endtag", "script"), + ]) @support.subTests('content', [ 'a::before { content: ""; }', 'a::before { content: "¬-an-entity-ref;"; }', 'a::before { content: ""; }', 'a::before { content: "\u2603"; }', - 'a::before { content: "< /style>"; }', - 'a::before { content: ""; }', - 'a::before { content: ""; }', - 'a::before { content: ""; }', - 'a::before { content: ""; }', - 'a::before { content: ""; }', ]) def test_style_content(self, content): s = f'' @@ -324,47 +326,59 @@ def test_style_content(self, content): ("data", content), ("endtag", "style")]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /title>', - '', - '', - '', - '', - '', + @support.subTests('tag', ['title', 'textarea']) + def test_rcdata_content(self, tag): + source = f"<{tag}>{SAMPLE_RCDATA}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", SAMPLE_RCDATA), + ("endtag", tag), ]) - def test_title_content(self, content): - source = f"{content}" + source = f"<{tag}>&" self._run_check(source, [ - ("starttag", "title", []), - ("data", content), - ("endtag", "title"), + ("starttag", tag, []), + ('entityref', 'amp'), + ("endtag", tag), ]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /textarea>', - '', - '', - '', - '', + @support.subTests('tag', + ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script']) + def test_rawtext_content(self, tag): + source = f"<{tag}>{SAMPLE_RAWTEXT}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", SAMPLE_RAWTEXT), + ("endtag", tag), + ]) + + def test_noscript_content(self): + source = f"" + # scripting=False -- normal mode + self._run_check(source, [ + ('starttag', 'noscript', []), + ('comment', ' not a comment '), + ('starttag', 'not', [('a', 'start tag')]), + ('unknown decl', 'CDATA[not a cdata'), + ('comment', 'not a bogus comment'), + ('endtag', 'not'), + ('data', '☃'), + ('entityref', 'amp'), + ('charref', '9786'), + ('endtag', 'noscript'), ]) - def test_textarea_content(self, content): - source = f"" + # scripting=True -- RAWTEXT mode + self._run_check(source, [ + ("starttag", "noscript", []), + ("data", SAMPLE_RAWTEXT), + ("endtag", "noscript"), + ], collector=EventCollector(scripting=True)) + + def test_plaintext_content(self): + content = SAMPLE_RAWTEXT + '' # not closing + source = f"{content}" self._run_check(source, [ - ("starttag", "textarea", []), + ("starttag", "plaintext", []), ("data", content), - ("endtag", "textarea"), ]) @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', @@ -381,52 +395,65 @@ def test_script_closing_tag(self, endtag): ("endtag", "script")], collector=EventCollectorNoNormalize(convert_charrefs=False)) - @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n', - 'style/', 'style foo=bar', 'style foo=">"']) - def test_style_closing_tag(self, endtag): - content = """ - b::before { content: "<!-- not a comment -->"; } - p::before { content: "&not-an-entity-ref;"; } - a::before { content: "<i>"; } - a::after { content: "</i>"; } - """ - s = f'<StyLE>{content}</{endtag}>' - self._run_check(s, [("starttag", "style", []), - ("data", content), - ("endtag", "style")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n', - 'title/', 'title foo=bar', 'title foo=">"']) - def test_title_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg &amp; Spam</i>" - s = f'<TitLe>{content}</{endtag}>' - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n', - 'textarea/', 'textarea foo=bar', 'textarea foo=">"']) - def test_textarea_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg &amp; Spam</i>" - s = f'<TexTarEa>{content}</{endtag}>' - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_closing_tag(self, tag): + for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n', + f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']: + content = "<!-- not a comment --><i>Spam</i>" + s = f'<{tag.upper()}>{content}</{endtag}>' + self._run_check(s, [ + ("starttag", tag, []), + ('data', content), + ("endtag", tag), + ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_invalid_closing_tag(self, tag): + content = ( + f'< /{tag}>' + f'</ {tag}>' + f'</{tag}x>' + f'</{tag}\v>' + f'</{tag}\xa0>' + ) + source = f"<{tag}>{content}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + @support.subTests('tag,endtag', [ + ('title', 'tıtle'), + ('style', 'ſtyle'), + ('style', 'ſtyle'), + ('style', 'style'), + ('iframe', 'ıframe'), + ('noframes', 'noframeſ'), + ('noscript', 'noſcript'), + ('noscript', 'noscrıpt'), + ('script', 'ſcript'), + ('script', 'scrıpt'), + ]) + def test_invalid_nonascii_closing_tag(self, tag, endtag): + content = f"<br></{endtag}>" + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + source = f"<{tag}>{content}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) @support.subTests('tail,end', [ ('', False), diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst new file mode 100644 index 00000000000000..c30c9439a76a19 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst @@ -0,0 +1,3 @@ +Add support of the "plaintext" element, RAWTEXT elements "xmp", "iframe", +"noembed" and "noframes", and optionally RAWTEXT element "noscript" in +:class:`html.parser.HTMLParser`.