diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 80fb8c3f929f6b..ee1189f95ca200 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -8,6 +8,7 @@
# and CDATA (character data -- only end tags are special).
+import string
import re
import _markupbase
@@ -105,6 +106,10 @@ def _replace_attr_charref(match):
def _unescape_attrvalue(s):
return attr_charref.sub(_replace_attr_charref, s)
+def _ascii_lower(s, *, table=str.maketrans(string.ascii_uppercase,
+                                           string.ascii_lowercase)):
+    """Return a copy of *s* with only the ASCII letters A-Z lowercased.
+
+    Every other character, including non-ASCII letters that str.lower()
+    would case-map, is left unchanged.  *table* is the translation table;
+    its default is built once, when the function is defined.
+    """
+    lowered = s.translate(table)
+    return lowered
+
class HTMLParser(_markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
@@ -179,7 +184,7 @@ def get_starttag_text(self):
return self.__starttag_text
def set_cdata_mode(self, elem, *, escapable=False):
- self.cdata_elem = elem.lower()
+ self.cdata_elem = _ascii_lower(elem)
self._escapable = escapable
if self.cdata_elem == 'plaintext':
self.interesting = re.compile(r'\z')
@@ -284,7 +289,7 @@ def goahead(self, end):
self.handle_comment(rawdata[i+4:j])
elif startswith("
gtpos = rawdata.find('>', i+9)
if gtpos == -1:
@@ -438,7 +443,7 @@ def parse_starttag(self, i):
match = tagfind_tolerant.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
- self.lasttag = tag = match.group(1).lower()
+ self.lasttag = tag = _ascii_lower(match.group(1))
while k < endpos:
m = attrfind_tolerant.match(rawdata, k)
if not m:
@@ -451,7 +456,7 @@ def parse_starttag(self, i):
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = _unescape_attrvalue(attrvalue)
- attrs.append((attrname.lower(), attrvalue))
+ attrs.append((_ascii_lower(attrname), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
@@ -507,7 +512,7 @@ def parse_endtag(self, i):
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
match = tagfind_tolerant.match(rawdata, i+2)
assert match
- tag = match.group(1).lower()
+ tag = _ascii_lower(match.group(1))
self.handle_endtag(tag)
self.clear_cdata_mode()
return j
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e4eff1ea17a670..b061027883a991 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -533,6 +533,37 @@ def test_invalid_nonascii_closing_tag(self, tag, endtag):
("endtag", tag),
], collector=EventCollector(convert_charrefs=False, scripting=True))
+    # Each tag starts with an ASCII letter and contains a non-ASCII character
+    # that str.lower()/str.upper() relate to an ASCII letter.  Only A-Z may be
+    # lowercased, so the non-ASCII character must come through unchanged.
+    @support.subTests('tag,converted', [
+        ('TıTLE', 'tıtle'),
+        ('NOFRAMEſ', 'noframeſ'),
+        ('NOſCRIPT', 'noſcript'),
+        ('NOSCRıPT', 'noscrıpt'),
+        ('SCRıPT', 'scrıpt'),
+        ('ADDREß', 'addreß'),
+        # Written as escapes: these characters are easily replaced by their
+        # ASCII look-alikes when the file passes through normalizing tools.
+        ('DATALI\ufb06', 'datali\ufb06'),  # U+FB06 LATIN SMALL LIGATURE ST
+        ('Lı', 'lı'),
+        ('LIN\u212a', 'lin\u212a'),  # U+212A KELVIN SIGN
+    ])
+    def test_nonascii_tag(self, tag, converted):
+        # Starts with ASCII letter
+        source = f"<{tag}><br></{tag}>"
+        self._run_check(source, [
+            ("starttag", converted, []),
+            ("starttag", "br", []),
+            ("endtag", converted),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+    # U+FB05 (long s-t ligature), U+FB06 (st ligature) and U+212A (Kelvin
+    # sign) are written as escapes: they are easily replaced by their ASCII
+    # look-alikes, which would turn these into ordinary valid tag names.
+    @support.subTests('tag', ['ſtyle', '\ufb05yle', '\ufb06yle', 'ıframe',
+                              'ſcript', 'ı', '\u212aBD', 'ſMALL',
+                              '\ufb06RONG'])
+    def test_invalid_nonascii_tag(self, tag):
+        # Starts with non-ASCII letter
+        source = f"<{tag}><br></{tag}>"
+        self._run_check(source, [
+            # "<" + non-ASCII letter does not open a tag: it stays text.
+            ("data", f"<{tag}>"),
+            ("starttag", "br", []),
+            # "</" + non-ASCII letter is reported as a comment, not an end tag.
+            ("comment", f"{tag}"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
@support.subTests('tail,end', [
('', False),
('<', False),
@@ -1068,7 +1099,7 @@ def test_attr_values(self):
"",
[("starttag", "a", [("href", "mailto:xyz@example.com")])])
- def test_attr_nonascii(self):
+ def test_attr_value_nonascii(self):
# see issue 7311
self._run_check(
"
",
@@ -1083,6 +1114,16 @@ def test_attr_nonascii(self):
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])
+ def test_attr_name_nonascii(self):
+ self._run_check(
+ '