From d4e028df371337c764a689d7c94031b6c98823f8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 19 Nov 2025 19:52:09 +0200 Subject: [PATCH] gh-141756: Fix handling non-ASCII element and attribute names in HTMLParser Only ASCII letters are now converted to lower case. --- Lib/html/parser.py | 17 +++++--- Lib/test/test_htmlparser.py | 43 ++++++++++++++++++- ...-11-19-19-52-05.gh-issue-141756.7Xcg90.rst | 3 ++ 3 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-11-19-19-52-05.gh-issue-141756.7Xcg90.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 80fb8c3f929f6b..ee1189f95ca200 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -8,6 +8,7 @@ # and CDATA (character data -- only end tags are special). +import string import re import _markupbase @@ -105,6 +106,10 @@ def _replace_attr_charref(match): def _unescape_attrvalue(s): return attr_charref.sub(_replace_attr_charref, s) +def _ascii_lower(s, *, table=str.maketrans(string.ascii_uppercase, + string.ascii_lowercase)): + return s.translate(table) + class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. 
@@ -179,7 +184,7 @@ def get_starttag_text(self): return self.__starttag_text def set_cdata_mode(self, elem, *, escapable=False): - self.cdata_elem = elem.lower() + self.cdata_elem = _ascii_lower(elem) self._escapable = escapable if self.cdata_elem == 'plaintext': self.interesting = re.compile(r'\z') @@ -284,7 +289,7 @@ def goahead(self, end): self.handle_comment(rawdata[i+4:j]) elif startswith(" gtpos = rawdata.find('>', i+9) if gtpos == -1: @@ -438,7 +443,7 @@ def parse_starttag(self, i): match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() - self.lasttag = tag = match.group(1).lower() + self.lasttag = tag = _ascii_lower(match.group(1)) while k < endpos: m = attrfind_tolerant.match(rawdata, k) if not m: @@ -451,7 +456,7 @@ def parse_starttag(self, i): attrvalue = attrvalue[1:-1] if attrvalue: attrvalue = _unescape_attrvalue(attrvalue) - attrs.append((attrname.lower(), attrvalue)) + attrs.append((_ascii_lower(attrname), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() @@ -507,7 +512,7 @@ def parse_endtag(self, i): # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state match = tagfind_tolerant.match(rawdata, i+2) assert match - tag = match.group(1).lower() + tag = _ascii_lower(match.group(1)) self.handle_endtag(tag) self.clear_cdata_mode() return j diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index e4eff1ea17a670..b061027883a991 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -533,6 +533,37 @@ def test_invalid_nonascii_closing_tag(self, tag, endtag): ("endtag", tag), ], collector=EventCollector(convert_charrefs=False, scripting=True)) + @support.subTests('tag,converted', [ + ('TıTLE', 'tıtle'), + ('NOFRAMEſ', 'noframeſ'), + ('NOſCRIPT', 'noſcript'), + ('NOSCRıPT', 'noscrıpt'), + ('SCRıPT', 'scrıpt'), + ('ADDREß', 'addreß'), + ('DATALIst', 'datalist'), + ('Lı', 'lı'), + ('LINK', 'linK'), + ]) + def test_nonascii_tag(self, tag, 
converted): + # Starts with ASCII letter + source = f"<{tag}><br></{tag}>" + self._run_check(source, [ + ("starttag", converted, []), + ("starttag", "br", []), + ("endtag", converted), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + @support.subTests('tag', ['ſtyle', 'ſtyle', 'style', 'ıframe', 'ſcript', + 'ı', 'KBD', 'ſMALL', 'stRONG']) + def test_invalid_nonascii_tag(self, tag): + # Starts with non-ASCII letter + source = f"<{tag}><br></{tag}>" + self._run_check(source, [ + ("data", f"<{tag}>"), + ("starttag", "br", []), + ("comment", f"{tag}"), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + @support.subTests('tail,end', [ ('', False), ('<', False), @@ -1068,7 +1099,7 @@ def test_attr_values(self): "", [("starttag", "a", [("href", "mailto:xyz@example.com")])]) - def test_attr_nonascii(self): + def test_attr_value_nonascii(self): # see issue 7311 self._run_check( "\u4e2d\u6587", @@ -1083,6 +1114,16 @@ def test_attr_nonascii(self): [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), ("href", "\u30c6\u30b9\u30c8.html")])]) + def test_attr_name_nonascii(self): + self._run_check( + '