Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# and CDATA (character data -- only end tags are special).


import string
import re
import _markupbase

Expand Down Expand Up @@ -105,6 +106,10 @@ def _replace_attr_charref(match):
def _unescape_attrvalue(s):
    """Return *s* with every ``attr_charref`` match rewritten.

    Each match is replaced by whatever ``_replace_attr_charref`` returns
    for it; text outside the matches is left as is.
    """
    unescaped = attr_charref.sub(_replace_attr_charref, s)
    return unescaped

def _ascii_lower(s, *, table=str.maketrans(string.ascii_uppercase,
string.ascii_lowercase)):
return s.translate(table)


class HTMLParser(_markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
Expand Down Expand Up @@ -179,7 +184,7 @@ def get_starttag_text(self):
return self.__starttag_text

def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self.cdata_elem = _ascii_lower(elem)
self._escapable = escapable
if self.cdata_elem == 'plaintext':
self.interesting = re.compile(r'\z')
Expand Down Expand Up @@ -284,7 +289,7 @@ def goahead(self, end):
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i) and self._support_cdata:
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
elif _ascii_lower(rawdata[i:i+9]) == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
# bogus comment
Expand Down Expand Up @@ -372,7 +377,7 @@ def parse_html_declaration(self, i):
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
elif _ascii_lower(rawdata[i:i+9]) == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
if gtpos == -1:
Expand Down Expand Up @@ -438,7 +443,7 @@ def parse_starttag(self, i):
match = tagfind_tolerant.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = match.group(1).lower()
self.lasttag = tag = _ascii_lower(match.group(1))
while k < endpos:
m = attrfind_tolerant.match(rawdata, k)
if not m:
Expand All @@ -451,7 +456,7 @@ def parse_starttag(self, i):
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
attrs.append((_ascii_lower(attrname), attrvalue))
k = m.end()

end = rawdata[k:endpos].strip()
Expand Down Expand Up @@ -507,7 +512,7 @@ def parse_endtag(self, i):
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
match = tagfind_tolerant.match(rawdata, i+2)
assert match
tag = match.group(1).lower()
tag = _ascii_lower(match.group(1))
self.handle_endtag(tag)
self.clear_cdata_mode()
return j
Expand Down
43 changes: 42 additions & 1 deletion Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,37 @@ def test_invalid_nonascii_closing_tag(self, tag, endtag):
("endtag", tag),
], collector=EventCollector(convert_charrefs=False, scripting=True))

@support.subTests('tag,converted', [
    ('TıTLE', 'tıtle'),
    ('NOFRAMEſ', 'noframeſ'),
    ('NOſCRIPT', 'noſcript'),
    ('NOSCRıPT', 'noscrıpt'),
    ('SCRıPT', 'scrıpt'),
    ('ADDREß', 'addreß'),
    # U+FB06 LATIN SMALL LIGATURE ST, written as an escape so that text
    # normalization cannot silently turn it into plain "st".
    # NOTE(review): presumed original code point -- confirm.
    ('DATALI\ufb06', 'datali\ufb06'),
    ('Lı', 'lı'),
    # U+212A KELVIN SIGN: str.lower() maps it to ASCII "k", but it is not
    # an ASCII letter and must be preserved.  With a plain ASCII "K" the
    # expected name would be "link", so the escape is required here.
    ('LIN\u212a', 'lin\u212a'),
])
def test_nonascii_tag(self, tag, converted):
    # Tag names that start with an ASCII letter but contain non-ASCII
    # letters: only the ASCII letters may be converted to lowercase.
    source = f"<{tag}><br></{tag}>"
    self._run_check(source, [
        ("starttag", converted, []),
        ("starttag", "br", []),
        ("endtag", converted),
    ], collector=EventCollector(convert_charrefs=False, scripting=True))

@support.subTests('tag', [
    'ſtyle',
    # U+FB05 LATIN SMALL LIGATURE LONG S T and U+FB06 LATIN SMALL
    # LIGATURE ST, written as escapes so that text normalization cannot
    # turn them into ASCII letters (which would make the tags valid).
    # NOTE(review): presumed original code points -- confirm.
    '\ufb05yle',
    '\ufb06yle',
    'ıframe',
    'ſcript',
    'ı',
    '\u212aBD',     # U+212A KELVIN SIGN, not ASCII "K"
    'ſMALL',
    '\ufb06RONG',
])
def test_invalid_nonascii_tag(self, tag):
    # Starts with non-ASCII letter: "<tag>" is reported as data and
    # "</tag>" as a comment containing the tag text.
    source = f"<{tag}><br></{tag}>"
    self._run_check(source, [
        ("data", f"<{tag}>"),
        ("starttag", "br", []),
        ("comment", f"{tag}"),
    ], collector=EventCollector(convert_charrefs=False, scripting=True))

@support.subTests('tail,end', [
('', False),
('<', False),
Expand Down Expand Up @@ -1068,7 +1099,7 @@ def test_attr_values(self):
"<a href=mailto:xyz@example.com>",
[("starttag", "a", [("href", "mailto:xyz@example.com")])])

def test_attr_nonascii(self):
def test_attr_value_nonascii(self):
# see issue 7311
self._run_check(
"<img src=/foo/bar.png alt=\u4e2d\u6587>",
Expand All @@ -1083,6 +1114,16 @@ def test_attr_nonascii(self):
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])

def test_attr_name_nonascii(self):
    # Only ASCII letters in attribute names are converted to lowercase.
    # "\u212a" is U+212A KELVIN SIGN: str.lower() would map it to ASCII
    # "k", but it must be kept as is.  It is written as an escape because
    # a plain ASCII "K" would make the expected names below wrong.
    self._run_check(
        '<BUTTON ACCEß\u212aEY="s">',
        [('starttag', 'button', [('acceß\u212aey', 's')])])
    self._run_check(
        '<TRACK \u212aIND="chapters" ſRC="sampleChapters.vtt" SRCLANG="en" />',
        [('startendtag', 'track', [('\u212aind', 'chapters'),
                                   ('ſrc', 'sampleChapters.vtt'),
                                   ('srclang', 'en')])])

def test_attr_entity_replacement(self):
self._run_check(
"<a b='&amp;&gt;&lt;&quot;&apos;'>",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix handling of non-ASCII element and attribute names in
:class:`html.parser.HTMLParser`. Only ASCII letters are now converted to
lowercase.
Loading