
More stuff working including treewalkers, parts of parse.py, dom, (c)ElementTree

--HG--
branch : svgmathml
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401266
commit aa58129690b28d5e97ec68f63230e26fde9d7ac1 (1 parent: 495b92b)
jgraham.html authored
Showing with 303 additions and 264 deletions.
  1. +22 −16 parse.py
  2. +2 −3 src/html5lib/__init__.py
  3. +12 −12 src/html5lib/filters/formfiller.py
  4. +3 −3 src/html5lib/filters/inject_meta_charset.py
  5. +1 −1  src/html5lib/filters/iso639codes.py
  6. +21 −21 src/html5lib/filters/lint.py
  7. +1 −1  src/html5lib/filters/optionaltags.py
  8. +1 −1  src/html5lib/filters/sanitizer.py
  9. +38 −38 src/html5lib/filters/validator.py
  10. +4 −4 src/html5lib/filters/whitespace.py
  11. +3 −2 src/html5lib/html5parser.py
  12. +3 −3 src/html5lib/ihatexml.py
  13. +7 −3 src/html5lib/inputstream.py
  14. +2 −2 src/html5lib/serializer/__init__.py
  15. +18 −15 src/html5lib/serializer/htmlserializer.py
  16. +1 −1  src/html5lib/serializer/xhtmlserializer.py
  17. +8 −2 src/html5lib/treebuilders/__init__.py
  18. +17 −17 src/html5lib/treebuilders/dom.py
  19. +15 −13 src/html5lib/treebuilders/etree.py
  20. +8 −8 src/html5lib/treebuilders/simpletree.py
  21. +10 −4 src/html5lib/treewalkers/__init__.py
  22. +9 −9 src/html5lib/treewalkers/_base.py
  23. +2 −2 src/html5lib/treewalkers/dom.py
  24. +2 −2 src/html5lib/treewalkers/etree.py
  25. +3 −3 src/html5lib/treewalkers/pulldom.py
  26. +2 −2 src/html5lib/treewalkers/simpletree.py
  27. +2 −2 src/html5lib/utils.py
  28. +9 −9 tests/support.py
  29. +24 −20 tests/test_parser.py
  30. +21 −20 tests/test_tokenizer.py
  31. +32 −25 tests/test_treewalkers.py
38 parse.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3.0
"""usage: %prog [options] filename
Parse a document to a simpletree tree, with optional profiling
@@ -9,11 +9,16 @@
import os
from optparse import OptionParser
+print(sys.stdout.encoding)
+
#RELEASE remove
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
#END RELEASE
-from html5lib import html5parser, liberalxmlparser, sanitizer
+print(sys.path)
+import html5lib
+import html5lib.html5parser as html5parser
from html5lib.tokenizer import HTMLTokenizer
+from html5lib import treebuilders
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
@@ -27,8 +32,8 @@ def parse():
# Try opening from the internet
if f.startswith('http://'):
try:
- import urllib, cgi
- f = urllib.urlopen(f)
+ from urllib import request
+ f = request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
@@ -39,7 +44,7 @@ def parse():
else:
try:
# Try opening from file system
- f = open(f)
+ f = open(f, "rb")
except IOError: pass
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
@@ -64,16 +69,16 @@ def parse():
if opts.profile:
#XXX should import cProfile instead and use that
- import hotshot
- import hotshot.stats
- prof = hotshot.Profile('stats.prof')
- prof.runcall(parseMethod, f, encoding=encoding)
+ try:
+ import cProfile as profile
+ except ImportError:
+ import profile
+ import pstats
+ prof = profile.run('parseMethod(f, encoding=encoding)', 'prof.out')
prof.close()
# XXX - We should use a temp file here
- stats = hotshot.stats.load('stats.prof')
- stats.strip_dirs()
- stats.sort_stats('time')
- stats.print_stats()
+ stats = pstats.stats('prof.out')
+ stats.strip_dirs().sort_stats('time').print_stats()
elif opts.time:
import time
t0 = time.time()
@@ -88,13 +93,14 @@ def parse():
def printOutput(parser, document, opts):
if opts.encoding:
- print "Encoding:", parser.tokenizer.stream.charEncoding
+ print("Encoding:", parser.tokenizer.stream.charEncoding)
if opts.xml:
sys.stdout.write(document.toxml("utf-8"))
elif opts.tree:
if not hasattr(document,'__getitem__'): document = [document]
for fragment in document:
- print parser.tree.testSerializer(fragment).encode("utf-8")
+ sys.stdout.write(parser.tree.testSerializer(fragment))
+ sys.stdout.write("\n")
elif opts.hilite:
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
@@ -103,7 +109,7 @@ def printOutput(parser, document, opts):
kwargs[opt] = getattr(opts,opt)
if not kwargs['quote_char']: del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
- for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
+ for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
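
Note on the profiling hunk above: as committed, prof.close() will fail because profile.run(...) returns None rather than a profiler object, and pstats.stats would need to be pstats.Stats; the earlier urllib hunk also drops the import of cgi that cgi.parse_header still relies on a few lines later. A more conventional cProfile/pstats invocation, sketched on the assumption that parseMethod, f and encoding are in scope as they are inside parse(), looks like this:

    try:
        import cProfile as profile
    except ImportError:
        import profile
    import pstats

    # Run the parse under the profiler and dump raw timings to a file.
    profile.runctx("parseMethod(f, encoding=encoding)",
                   globals(), locals(), "stats.prof")

    # Load the dump, tidy the paths, and show the hottest functions first.
    stats = pstats.Stats("stats.prof")
    stats.strip_dirs().sort_stats("time").print_stats()
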
5 src/html5lib/__init__.py
@@ -10,10 +10,9 @@
f = open("my_document.html")
tree = html5lib.parse(f)
"""
-print(__path__)
-#from .html5parser import HTMLParser, parse
-#from treebuilders import getTreeBuilder
+from .html5parser import HTMLParser, parse
+from .treebuilders import getTreeBuilder
#from .liberalxmlparser import XMLParser, XHTMLParser
24 src/html5lib/filters/formfiller.py
@@ -4,10 +4,10 @@
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
-import _base
+from . import _base
from html5lib.constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
@@ -29,13 +29,13 @@ def __iter__(self):
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
- if n == u"name":
+ if n == "name":
field_name = v.strip(spaceCharacters)
- elif n == u"type":
+ elif n == "type":
field_type = v.strip(spaceCharacters)
- elif n == u"checked":
+ elif n == "checked":
input_checked_index = i
- elif n == u"value":
+ elif n == "value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
@@ -45,20 +45,20 @@ def __iter__(self):
else:
value = ""
- if field_type in (u"checkbox", u"radio"):
+ if field_type in ("checkbox", "radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
- token["data"].append((u"checked", u""))
+ token["data"].append(("checked", ""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
- elif field_type not in (u"button", u"submit", u"reset"):
+ elif field_type not in ("button", "submit", "reset"):
if input_value_index >= 0:
- token["data"][input_value_index] = (u"value", value)
+ token["data"][input_value_index] = ("value", value)
else:
- token["data"].append((u"value", value))
+ token["data"].append(("value", value))
field_indices[field_name] = field_index + 1
field_type = None
@@ -96,7 +96,7 @@ def __iter__(self):
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
- token["data"].append((u"selected", u""))
+ token["data"].append(("selected", ""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
6 src/html5lib/filters/inject_meta_charset.py
@@ -1,4 +1,4 @@
-import _base
+from . import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
@@ -23,7 +23,7 @@ def __iter__(self):
content_index = -1
for i,(name,value) in enumerate(token["data"]):
if name.lower() == 'charset':
- token["data"][i] = (u'charset', self.encoding)
+ token["data"][i] = ('charset', self.encoding)
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
@@ -32,7 +32,7 @@ def __iter__(self):
content_index = i
else:
if has_http_equiv_content_type and content_index >= 0:
- token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
+ token["data"][content_index] = ('content', 'text/html; charset=%s' % self.encoding)
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
2  src/html5lib/filters/iso639codes.py
@@ -746,4 +746,4 @@ def isValidLangCode(value):
lang, sublang = value.split('-', 1)
else:
lang = value
- return isoLang.has_key(unicode.lower(unicode(lang)))
+ return str.lower(str(lang)) in isoLang
42 src/html5lib/filters/lint.py
@@ -1,11 +1,11 @@
from gettext import gettext
_ = gettext
-import _base
+from . import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
class LintError(Exception): pass
@@ -19,22 +19,22 @@ def __iter__(self):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty tag name"))
+ raise LintError(_("Empty tag name"))
if type == "StartTag" and name in voidElements:
- raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+ raise LintError(_("Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
- raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+ raise LintError(_("Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
- if not isinstance(name, unicode):
+ if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty attribute name"))
- if not isinstance(value, unicode):
+ raise LintError(_("Empty attribute name"))
+ if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
@@ -45,15 +45,15 @@ def __iter__(self):
elif type == "EndTag":
name = token["name"]
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
if not name:
- raise LintError(_(u"Empty tag name"))
+ raise LintError(_("Empty tag name"))
if name in voidElements:
- raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+ raise LintError(_("Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
- raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+ raise LintError(_("EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
@@ -62,27 +62,27 @@ def __iter__(self):
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
- if not isinstance(data, unicode):
+ if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
- raise LintError(_(u"%s token with empty data") % type)
+ raise LintError(_("%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
- raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
+ raise LintError(_("Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
- if not isinstance(name, unicode):
- raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
- raise LintError(_(u"Unknown token type: %s") % type)
+ raise LintError(_("Unknown token type: %s") % type)
yield token
2  src/html5lib/filters/optionaltags.py
@@ -1,4 +1,4 @@
-import _base
+from . import _base
class Filter(_base.Filter):
def slider(self):
2  src/html5lib/filters/sanitizer.py
@@ -1,4 +1,4 @@
-import _base
+from . import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
76 src/html5lib/filters/validator.py
@@ -18,10 +18,10 @@
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
-import _base
-import iso639codes
-import rfc3987
-import rfc2046
+from . import _base
+from . import iso639codes
+from . import rfc3987
+from . import rfc2046
from html5lib.constants import E, spaceCharacters, digits, tokenTypes
from html5lib import tokenizer
import gettext
@@ -29,61 +29,61 @@
E.update({
"unknown-start-tag":
- _(u"Unknown start tag <%(tagName)s>."),
+ _("Unknown start tag <%(tagName)s>."),
"unknown-attribute":
- _(u"Unknown '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Unknown '%(attributeName)s' attribute on <%(tagName)s>."),
"missing-required-attribute":
- _(u"The '%(attributeName)s' attribute is required on <%(tagName)s>."),
+ _("The '%(attributeName)s' attribute is required on <%(tagName)s>."),
"unknown-input-type":
- _(u"Illegal value for attribute on <input type='%(inputType)s'>."),
+ _("Illegal value for attribute on <input type='%(inputType)s'>."),
"attribute-not-allowed-on-this-input-type":
- _(u"The '%(attributeName)s' attribute is not allowed on <input type=%(inputType)s>."),
+ _("The '%(attributeName)s' attribute is not allowed on <input type=%(inputType)s>."),
"deprecated-attribute":
- _(u"This attribute is deprecated: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("This attribute is deprecated: '%(attributeName)s' attribute on <%(tagName)s>."),
"duplicate-value-in-token-list":
- _(u"Duplicate value in token list: '%(attributeValue)s' in '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Duplicate value in token list: '%(attributeValue)s' in '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-attribute-value":
- _(u"Invalid attribute value: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid attribute value: '%(attributeName)s' attribute on <%(tagName)s>."),
"space-in-id":
- _(u"Whitespace is not allowed here: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Whitespace is not allowed here: '%(attributeName)s' attribute on <%(tagName)s>."),
"duplicate-id":
- _(u"This ID was already defined earlier: 'id' attribute on <%(tagName)s>."),
+ _("This ID was already defined earlier: 'id' attribute on <%(tagName)s>."),
"attribute-value-can-not-be-blank":
- _(u"This value can not be blank: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("This value can not be blank: '%(attributeName)s' attribute on <%(tagName)s>."),
"id-does-not-exist":
- _(u"This value refers to a non-existent ID: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("This value refers to a non-existent ID: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-enumerated-value":
- _(u"Value must be one of %(enumeratedValues)s: '%(attributeName)s' attribute on <%tagName)s>."),
+ _("Value must be one of %(enumeratedValues)s: '%(attributeName)s' attribute on <%tagName)s>."),
"invalid-boolean-value":
- _(u"Value must be one of %(enumeratedValues)s: '%(attributeName)s' attribute on <%tagName)s>."),
+ _("Value must be one of %(enumeratedValues)s: '%(attributeName)s' attribute on <%tagName)s>."),
"contextmenu-must-point-to-menu":
- _(u"The contextmenu attribute must point to an ID defined on a <menu> element."),
+ _("The contextmenu attribute must point to an ID defined on a <menu> element."),
"invalid-lang-code":
- _(u"Invalid language code: '%(attributeName)s' attibute on <%(tagName)s>."),
+ _("Invalid language code: '%(attributeName)s' attibute on <%(tagName)s>."),
"invalid-integer-value":
- _(u"Value must be an integer: '%(attributeName)s' attribute on <%tagName)s>."),
+ _("Value must be an integer: '%(attributeName)s' attribute on <%tagName)s>."),
"invalid-root-namespace":
- _(u"Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
+ _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
"invalid-browsing-context":
- _(u"Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-tag-uri":
- _(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-urn":
- _(u"Invalid URN: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid URN: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-uri-char":
- _(u"Illegal character in URI: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Illegal character in URI: '%(attributeName)s' attribute on <%(tagName)s>."),
"uri-not-iri":
- _(u"Expected a URI but found an IRI: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Expected a URI but found an IRI: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-uri":
- _(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-http-or-ftp-uri":
- _(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-scheme":
- _(u"Unregistered URI scheme: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Unregistered URI scheme: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-rel":
- _(u"Invalid link relation: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid link relation: '%(attributeName)s' attribute on <%(tagName)s>."),
"invalid-mime-type":
- _(u"Invalid MIME type: '%(attributeName)s' attribute on <%(tagName)s>."),
+ _("Invalid MIME type: '%(attributeName)s' attribute on <%(tagName)s>."),
})
globalAttributes = frozenset(('class', 'contenteditable', 'contextmenu', 'dir',
@@ -267,7 +267,7 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
self.IDsWeHaveKnownAndLoved = []
def __iter__(self):
- types = dict((v,k) for k,v in tokenTypes.iteritems())
+ types = dict((v,k) for k,v in tokenTypes.items())
for token in _base.Filter.__iter__(self):
fakeToken = {"type": types.get(token.get("type", "-"), "-"),
"name": token.get("name", "-").capitalize()}
@@ -301,12 +301,12 @@ def validateStartTagInput(self, token):
for t in self.checkAttributeValues(token) or []: yield t
attrDict = dict([(name.lower(), value) for name, value in token.get("data", [])])
inputType = attrDict.get("type", "text")
- if inputType not in inputTypeAllowedAttributeMap.keys():
+ if inputType not in list(inputTypeAllowedAttributeMap.keys()):
yield {"type": tokenTypes["ParseError"],
"data": "unknown-input-type",
"datavars": {"attrValue": inputType}}
allowedAttributes = inputTypeAllowedAttributeMap.get(inputType, [])
- for attrName, attrValue in attrDict.items():
+ for attrName, attrValue in list(attrDict.items()):
if attrName not in allowedAttributeMap['input']:
yield {"type": tokenTypes["ParseError"],
"data": "unknown-attribute",
@@ -330,7 +330,7 @@ def validateStartTagInput(self, token):
def checkUnknownStartTag(self, token):
# check for recognized tag name
name = token.get("name", "").lower()
- if name not in allowedAttributeMap.keys():
+ if name not in list(allowedAttributeMap.keys()):
yield {"type": tokenTypes["ParseError"],
"data": "unknown-start-tag",
"datavars": {"tagName": name}}
@@ -338,7 +338,7 @@ def checkUnknownStartTag(self, token):
def checkStartTagRequiredAttributes(self, token):
# check for presence of required attributes
name = token.get("name", "").lower()
- if name in requiredAttributeMap.keys():
+ if name in list(requiredAttributeMap.keys()):
attrsPresent = [attrName for attrName, attrValue
in token.get("data", [])]
for attrName in requiredAttributeMap[name]:
@@ -427,7 +427,7 @@ def checkTokenList(self, tagName, attrName, attrValue):
valueList = self.parseTokenList(attrValue)
valueDict = {}
for currentValue in valueList:
- if valueDict.has_key(currentValue):
+ if currentValue in valueDict:
yield {"type": tokenTypes["ParseError"],
"data": "duplicate-value-in-token-list",
"datavars": {"tagName": tagName,
8 src/html5lib/filters/whitespace.py
@@ -6,11 +6,11 @@
import re
-import _base
+from . import _base
from html5lib.constants import rcdataElements, spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
-SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
+SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter):
@@ -29,7 +29,7 @@ def __iter__(self):
elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test on token["data"] above to not introduce spaces where there were not
- token["data"] = u" "
+ token["data"] = " "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
5 src/html5lib/html5parser.py
@@ -15,8 +15,9 @@
from .constants import cdataElements, rcdataElements, voidElements
from .constants import tokenTypes
-def parse(doc, treebuilderName="simpletree", encoding=None):
- tb = treebuilders.getTreeBuilder(treebuilderName)
+def parse(doc, treebuilderName="simpletree", encoding=None, implementation=None):
+ tb = treebuilders.getTreeBuilder(treebuilderName,
+ implementation=implementation)
p = HTMLParser(tb)
return p.parse(doc, encoding=encoding)
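
The parse() convenience function now threads an implementation argument through to getTreeBuilder, so callers can pick a specific ElementTree module instead of relying on the default. A minimal usage sketch of the new keyword (assuming a local my_document.html, as in the __init__.py docstring):

    import xml.etree.ElementTree as ElementTree
    import html5lib

    with open("my_document.html", "rb") as f:
        # Build the tree with a specific ElementTree module rather than
        # whatever the "etree" builder would otherwise autodetect.
        tree = html5lib.parse(f, treebuilderName="etree",
                              implementation=ElementTree)
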
6 src/html5lib/ihatexml.py
@@ -90,12 +90,12 @@ def escapeRegexp(string):
for char in specialCharacters:
string = string.replace(char, r"\\" + char)
if char in string:
- print string
+ print(string)
return string
#output from the above
-nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
+nonXmlBMPRegexp = re.compile('[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
@@ -167,4 +167,4 @@ def escapeChar(self, char):
return replacement
def unescapeChar(self, charcode):
- return unichr(int(charcode[1:], 16))
+ return chr(int(charcode[1:], 16))
10 src/html5lib/inputstream.py
@@ -171,7 +171,12 @@ def openStream(self, source):
"""
# Already a file object
if hasattr(source, 'read'):
- if 'b' in f.mode:
+ #This is wrong. We need a generic way to tell the difference
+ #between file-like objects that produce strings and those that
+ #produce bytes. We also need a good way to deal with the ones
+ #that produce strings, in particular getting the replacement
+ #characters right.
+ if not hasattr(source, 'encoding'):
stream = source
else:
raise NotImplementedError("Files not opened in binary mode not yet supported")
@@ -448,8 +453,7 @@ class EncodingBytes(bytes):
If the position is ever greater than the string length then an exception is
raised"""
def __init__(self, value):
- bytes.__init__(self, value)
- self._position=-1
+ self._position = -1
def __iter__(self):
return self
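
The openStream comment above flags the bytes-versus-text problem and settles on hasattr(source, 'encoding') as an interim heuristic: in Python 3, text-mode streams carry an encoding attribute while binary streams do not. A quick illustration of what that check sees:

    import io
    import sys

    binary = io.BytesIO(b"<p>hello</p>")
    print(hasattr(binary, "encoding"))      # False -> accepted as a byte stream

    # Text-mode streams (files opened without "b", sys.stdout, ...) expose
    # an encoding attribute, so for now openStream raises for them.
    print(hasattr(sys.stdout, "encoding"))  # True
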
4 src/html5lib/serializer/__init__.py
@@ -1,3 +1,3 @@
-from htmlserializer import HTMLSerializer
-from xhtmlserializer import XHTMLSerializer
+from .htmlserializer import HTMLSerializer
+from .xhtmlserializer import XHTMLSerializer
33 src/html5lib/serializer/htmlserializer.py
@@ -12,7 +12,7 @@
from xml.sax.saxutils import escape
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
try:
from codecs import register_error, xmlcharrefreplace_errors
@@ -24,7 +24,7 @@
from html5lib.constants import entities
encode_entity_map = {}
- for k, v in entities.items():
+ for k, v in list(entities.items()):
if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
@@ -41,7 +41,7 @@ def htmlentityreplace_errors(exc):
res.append(";")
else:
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
- return (u"".join(res), exc.end)
+ return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
@@ -76,7 +76,7 @@ class HTMLSerializer(object):
"escape_rcdata", 'use_trailing_solidus', "sanitize")
def __init__(self, **kwargs):
- if kwargs.has_key('quote_char'):
+ if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
@@ -103,7 +103,7 @@ def serialize(self, treewalker, encoding=None):
for token in treewalker:
type = token["type"]
if type == "Doctype":
- doctype = u"<!DOCTYPE %s>" % token["name"]
+ doctype = "<!DOCTYPE %s>" % token["name"]
if encoding:
yield doctype.encode(encoding)
else:
@@ -130,7 +130,7 @@ def serialize(self, treewalker, encoding=None):
self.serializeError(_("Unexpected child element of a CDATA element"))
attrs = token["data"]
if hasattr(attrs, "items"):
- attrs = attrs.items()
+ attrs = list(attrs.items())
attrs.sort()
attributes = []
for k,v in attrs:
@@ -139,15 +139,18 @@ def serialize(self, treewalker, encoding=None):
attributes.append(' ')
attributes.append(k)
- if not self.minimize_boolean_attributes or \
- (k not in booleanAttributes.get(name, tuple()) \
- and k not in booleanAttributes.get("", tuple())):
+ if (not self.minimize_boolean_attributes or
+ (k not in booleanAttributes.get(name, tuple())
+ and k not in booleanAttributes.get("", tuple()))):
attributes.append("=")
if self.quote_attr_values or not v:
quote_attr = True
else:
- quote_attr = reduce(lambda x,y: x or (y in v),
- spaceCharacters + ">\"'=", False)
+ quote_attr = False
+ for char in spaceCharacters + ">\"'=":
+ if char in v:
+ quote_attr = True
+ break
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding:
@@ -176,7 +179,7 @@ def serialize(self, treewalker, encoding=None):
if encoding:
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
else:
- yield u"<%s%s>" % (name, u"".join(attributes))
+ yield "<%s%s>" % (name, "".join(attributes))
elif type == "EndTag":
name = token["name"]
@@ -184,7 +187,7 @@ def serialize(self, treewalker, encoding=None):
in_cdata = False
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
- end_tag = u"</%s>" % name
+ end_tag = "</%s>" % name
if encoding:
end_tag = end_tag.encode(encoding, "strict")
yield end_tag
@@ -193,7 +196,7 @@ def serialize(self, treewalker, encoding=None):
data = token["data"]
if data.find("--") >= 0:
self.serializeError(_("Comment contains --"))
- comment = u"<!--%s-->" % token["data"]
+ comment = "<!--%s-->" % token["data"]
if encoding:
comment = comment.encode(encoding, unicode_encode_errors)
yield comment
@@ -205,7 +208,7 @@ def render(self, treewalker, encoding=None):
if encoding:
return "".join(list(self.serialize(treewalker, encoding)))
else:
- return u"".join(list(self.serialize(treewalker)))
+ return "".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
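
The attribute-quoting hunk above replaces the Python 2 reduce() expression with an explicit loop. An equivalent spelling, shown only as an alternative and not what the commit does, uses any() over the same characters, with v and spaceCharacters as in the surrounding code:

    # Quote the value if it contains whitespace, '>', '"', "'" or '='.
    quote_attr = any(char in v for char in spaceCharacters + ">\"'=")
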
2  src/html5lib/serializer/xhtmlserializer.py
@@ -1,4 +1,4 @@
-from htmlserializer import HTMLSerializer
+from .htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
quote_attr_values = True
10 src/html5lib/treebuilders/__init__.py
@@ -56,7 +56,7 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType == "dom":
- import dom
+ from . import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
@@ -73,7 +73,13 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
- import etree
+ if implementation is None:
+ try:
+ import xml.etree.cElementTree as etree
+ except:
+ import xml.etree.ElementTree as etree
+ implementation = etree
+ from . import etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
return treeBuilderCache.get(treeType)
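
The "etree" branch now autodetects cElementTree and falls back to the pure-Python ElementTree when no implementation is supplied, always deferring to the etree submodule for caching. Calling it directly with an explicit implementation might look like this (a sketch against the layout in this commit; keyword names as in the diff):

    import xml.etree.ElementTree as ElementTree
    from html5lib import html5parser, treebuilders

    # Passing implementation explicitly skips the cElementTree autodetection.
    TreeBuilder = treebuilders.getTreeBuilder("etree",
                                              implementation=ElementTree)
    parser = html5parser.HTMLParser(tree=TreeBuilder)
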
34 src/html5lib/treebuilders/dom.py
@@ -1,6 +1,6 @@
-import _base
+from . import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
-import new
+import types
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -12,7 +12,7 @@ def getDomModule(DomImplementation):
if name in moduleCache:
return moduleCache[name]
else:
- mod = new.module(name)
+ mod = types.ModuleType(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
@@ -24,14 +24,14 @@ class AttrList:
def __init__(self, element):
self.element = element
def __iter__(self):
- return self.element.attributes.items().__iter__()
+ return list(self.element.attributes.items()).__iter__()
def __setitem__(self, name, value):
- value=illegal_xml_chars.sub(u'\uFFFD',value)
+ value=illegal_xml_chars.sub('\uFFFD',value)
self.element.setAttribute(name, value)
def items(self):
- return self.element.attributes.items()
+ return list(self.element.attributes.items())
def keys(self):
- return self.element.attributes.keys()
+ return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
@@ -49,7 +49,7 @@ def appendChild(self, node):
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
- data=illegal_xml_chars.sub(u'\uFFFD',data)
+ data=illegal_xml_chars.sub('\uFFFD',data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
@@ -77,8 +77,8 @@ def getAttributes(self):
def setAttributes(self, attributes):
if attributes:
- for name, value in attributes.items():
- value=illegal_xml_chars.sub(u'\uFFFD',value)
+ for name, value in list(attributes.items()):
+ value=illegal_xml_chars.sub('\uFFFD',value)
self.element.setAttribute(name, value)
attributes = property(getAttributes, setAttributes)
@@ -132,8 +132,8 @@ def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
- data=illegal_xml_chars.sub(u'\uFFFD',data)
- if parent <> self:
+ data=illegal_xml_chars.sub('\uFFFD',data)
+ if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
@@ -171,7 +171,7 @@ def serializeElement(element, indent=0):
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if element.hasAttributes():
- for name, value in element.attributes.items():
+ for name, value in list(element.attributes.items()):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
@@ -191,7 +191,7 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
# gather namespace declarations
prefixes = []
- for attrname in node.attributes.keys():
+ for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
@@ -203,11 +203,11 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
del attributes[(attr.namespaceURI, attr.localName)]
# apply namespace declarations
- for attrname in node.attributes.keys():
+ for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
- if nsmap.has_key(prefix):
+ if prefix in nsmap:
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
@@ -241,5 +241,5 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
return locals()
# XXX: Keep backwards compatibility with things that directly load classes/functions from this module
-for key, value in getDomModule(minidom).__dict__.items():
+for key, value in list(getDomModule(minidom).__dict__.items()):
globals()[key] = value
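
The dom builder swaps the removed new module for types.ModuleType, the Python 3 way to create a module object at runtime, which getDomModule uses to wrap each DOM implementation. A minimal illustration (the module name here is made up):

    import types

    # Create an empty module object and populate its namespace, as
    # getDomModule does for each wrapped DOM implementation.
    mod = types.ModuleType("example_builder")
    mod.__dict__.update({"answer": 42})
    print(mod.answer)   # 42
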
28 src/html5lib/treebuilders/etree.py
@@ -1,5 +1,5 @@
-import _base
-import new
+from . import _base
+import types
from html5lib import ihatexml
@@ -10,7 +10,7 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
if name in moduleCache:
return moduleCache[name]
else:
- mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
+ mod = types.ModuleType("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
mod.__dict__.update(objs)
moduleCache[name] = mod
@@ -45,17 +45,19 @@ def _getAttributes(self):
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
- for key in self._element.attrib.keys():
+ for key in list(self._element.attrib.keys()):
del self._element.attrib[key]
- for key, value in attributes.iteritems():
+ for key, value in attributes.items():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
+
def _setChildNodes(self, value):
- del self._element[:]
+ while len(self._element):
+ del self._element[0]
self._childNodes = []
for element in value:
self.insertChild(element)
@@ -105,7 +107,7 @@ def insertText(self, data, insertBefore=None):
def cloneNode(self):
element = Element(self.name)
- for name, value in self.attributes.iteritems():
+ for name, value in self.attributes.items():
element.attributes[name] = value
return element
@@ -145,20 +147,20 @@ def __init__(self, name, publicId, systemId):
self.systemId = systemId
def _getPublicId(self):
- return self._element.get(u"publicId", "")
+ return self._element.get("publicId", "")
def _setPublicId(self, value):
if value is not None:
- self._element.set(u"publicId", value)
+ self._element.set("publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
- return self._element.get(u"systemId", "")
+ return self._element.get("systemId", "")
def _setSystemId(self, value):
if value is not None:
- self._element.set(u"systemId", value)
+ self._element.set("systemId", value)
systemId = property(_getSystemId, _setSystemId)
@@ -195,7 +197,7 @@ def serializeElement(element, indent=0):
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
- for name, value in element.attrib.iteritems():
+ for name, value in element.attrib.items():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@@ -246,7 +248,7 @@ def serializeElement(element):
else:
attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
- for name, value in element.attrib.iteritems()])
+ for name, value in element.attrib.items()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
16 src/html5lib/treebuilders/simpletree.py
@@ -18,7 +18,7 @@ def __iter__(self):
for item in node:
yield item
- def __unicode__(self):
+ def __str__(self):
return self.name
def toxml(self):
@@ -80,7 +80,7 @@ class Document(Node):
def __init__(self):
Node.__init__(self, None)
- def __unicode__(self):
+ def __str__(self):
return "#document"
def appendChild(self, child):
@@ -106,7 +106,7 @@ def printTree(self):
class DocumentFragment(Document):
type = 2
- def __unicode__(self):
+ def __str__(self):
return "#document-fragment"
class DocumentType(Node):
@@ -116,7 +116,7 @@ def __init__(self, name, publicId, systemId):
self.publicId = publicId
self.systemId = systemId
- def __unicode__(self):
+ def __str__(self):
if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
@@ -127,7 +127,7 @@ def __unicode__(self):
return "<!DOCTYPE %s>" % self.name
- toxml = __unicode__
+ toxml = __str__
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
@@ -138,7 +138,7 @@ def __init__(self, value):
Node.__init__(self, None)
self.value = value
- def __unicode__(self):
+ def __str__(self):
return "\"%s\"" % self.value
def toxml(self):
@@ -153,7 +153,7 @@ def __init__(self, name, namespace=None):
self.namespace = namespace
self.attributes = {}
- def __unicode__(self):
+ def __str__(self):
return "<%s>" % self.name
def toxml(self):
@@ -199,7 +199,7 @@ def __init__(self, data):
Node.__init__(self, None)
self.data = data
- def __unicode__(self):
+ def __str__(self):
return "<!-- %s -->" % self.data
def toxml(self):
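
simpletree renames __unicode__ to __str__ throughout: in Python 3 str is the text type, so __str__ is the hook that str() and print() call, and the separate __unicode__ protocol is gone. A toy version of the renamed hook:

    class Element:
        def __init__(self, name):
            self.name = name

        def __str__(self):          # was __unicode__ under Python 2
            return "<%s>" % self.name

    print(Element("p"))             # <p>
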
14 src/html5lib/treewalkers/__init__.py
@@ -37,16 +37,22 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
mod = __import__(treeType, globals())
treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi":
- import genshistream
+ from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
elif treeType == "beautifulsoup":
- import soup
+ from . import soup
treeWalkerCache[treeType] = soup.TreeWalker
elif treeType == "lxml":
- import lxmletree
+ from . import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
elif treeType == "etree":
- import etree
+ from . import etree
+ if implementation is None:
+ try:
+ from xml.etree import cElementTree as etree
+ except:
+ from xml.etree import ElementTree as etree
+ implementation = etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
18 src/html5lib/treewalkers/_base.py
@@ -2,7 +2,7 @@
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
def __init__(self, tree):
@@ -18,24 +18,24 @@ def normalizeAttrs(self, attrs):
if not attrs:
attrs = []
elif hasattr(attrs, 'items'):
- attrs = attrs.items()
- return [(unicode(name),unicode(value)) for name,value in attrs]
+ attrs = list(attrs.items())
+ return [(str(name),str(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False):
- yield {"type": "EmptyTag", "name": unicode(name), \
+ yield {"type": "EmptyTag", "name": str(name), \
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, name, attrs):
- return {"type": "StartTag", "name": unicode(name), \
+ return {"type": "StartTag", "name": str(name), \
"data": self.normalizeAttrs(attrs)}
def endTag(self, name):
- return {"type": "EndTag", "name": unicode(name), "data": []}
+ return {"type": "EndTag", "name": str(name), "data": []}
def text(self, data):
- data = unicode(data)
+ data = str(data)
middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)]
if left:
@@ -49,11 +49,11 @@ def text(self, data):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
- return {"type": "Comment", "data": unicode(data)}
+ return {"type": "Comment", "data": str(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
- "name": name is not None and unicode(name) or u"",
+ "name": name is not None and str(name) or "",
"publicId": publicId, "systemId": systemId,
"correct": correct}
4 src/html5lib/treewalkers/dom.py
@@ -3,7 +3,7 @@
import gettext
_ = gettext.gettext
-import _base
+from . import _base
from html5lib.constants import voidElements
@@ -16,7 +16,7 @@ def getNodeDetails(self, node):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
- return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
+ return _base.ELEMENT, node.nodeName, list(node.attributes.items()), node.hasChildNodes
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue
4 src/html5lib/treewalkers/etree.py
@@ -4,7 +4,7 @@
import new
import copy
-import _base
+from . import _base
from html5lib.constants import voidElements
moduleCache = {}
@@ -60,7 +60,7 @@ def getNodeDetails(self, node):
else:
#This is assumed to be an ordinary element
- return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
+ return _base.ELEMENT, node.tag, list(node.attrib.items()), len(node) or node.text
def getFirstChild(self, node):
if isinstance(node, tuple):
6 src/html5lib/treewalkers/pulldom.py
@@ -1,7 +1,7 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
-import _base
+from . import _base
from html5lib.constants import voidElements
@@ -31,10 +31,10 @@ def tokens(self, event, next):
name = node.nodeName
if name in voidElements:
for token in self.emptyTag(name, \
- node.attributes.items(), not next or next[1] is not node):
+ list(node.attributes.items()), not next or next[1] is not node):
yield token
else:
- yield self.startTag(name, node.attributes.items())
+ yield self.startTag(name, list(node.attributes.items()))
elif type == END_ELEMENT:
name = node.nodeName
4 src/html5lib/treewalkers/simpletree.py
@@ -1,7 +1,7 @@
import gettext
_ = gettext.gettext
-import _base
+from . import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given that simpletree has no performant way of getting a node's
@@ -33,7 +33,7 @@ def getNodeDetails(self, node):
elif node.type == 5: # Element
return _base.ELEMENT, node.name, \
- node.attributes.items(), node.hasContent()
+ list(node.attributes.items()), node.hasContent()
elif node.type == 6: # CommentNode
return _base.COMMENT, node.data
4 src/html5lib/utils.py
@@ -90,7 +90,7 @@ def extendleft(self, iterable):
def rotate(self, n=1):
if self:
n %= len(self)
- for i in xrange(n):
+ for i in range(n):
self.appendleft(self.pop())
def __getitem__(self, i):
@@ -116,7 +116,7 @@ def __delitem__(self, i):
data = self.data
if i < 0:
i += size
- for j in xrange(self.left+i, self.right-1):
+ for j in range(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
18 tests/support.py
@@ -16,11 +16,11 @@
#RELEASE add
#test_dir = './testdata'
#END RELEASE
-import simplejson
+import json as simplejson
#Build a dict of avaliable trees
-treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
- "DOM":treebuilders.getTreeBuilder("dom")}
+treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree")}
+ #"DOM":treebuilders.getTreeBuilder("dom")}
#Try whatever etree implementations are avaliable from a list that are
#"supposed" to work
@@ -69,7 +69,7 @@ def __getitem__(self, key):
class TestData(object):
def __init__(self, filename, newTestHeading="data"):
- self.f = open(filename)
+ self.f = open(filename, "rb")
self.newTestHeading = newTestHeading
def __iter__(self):
@@ -84,7 +84,7 @@ def __iter__(self):
yield self.normaliseOutput(data)
data = DefaultDict(None)
key = heading
- data[key]=""
+ data[key]=b""
elif key is not None:
data[key] += line
if data:
@@ -93,15 +93,15 @@ def __iter__(self):
def isSectionHeading(self, line):
"""If the current heading is a test section heading return the heading,
otherwise return False"""
- if line.startswith("#"):
- return line[1:].strip()
+ if line.startswith(b"#"):
+ return str(line[1:].strip(), "ascii")
else:
return False
def normaliseOutput(self, data):
#Remove trailing newlines
- for key,value in data.iteritems():
- if value.endswith("\n"):
+ for key,value in data.items():
+ if value.endswith(b"\n"):
data[key] = value[:-1]
return data
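
TestData now reads the .dat files in binary: headings such as #data are matched as bytes and decoded to ASCII for use as keys, while the section bodies stay as bytes until each test decodes them. Roughly, the format is keyed like this (a standalone sketch of the parsing idea, not the loader itself):

    import io

    sample = io.BytesIO(b"#data\n<p>x</p>\n#document\n| <p>\n|   \"x\"\n")
    sections = {}
    key = None
    for line in sample:
        if line.startswith(b"#"):
            key = str(line[1:].strip(), "ascii")   # heading -> text key
            sections[key] = b""
        elif key is not None:
            sections[key] += line                  # body stays bytes
    print(sorted(sections))                        # ['data', 'document']
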
44 tests/test_parser.py
@@ -1,7 +1,7 @@
import os
import sys
import traceback
-import StringIO
+import io
import unittest
import warnings
@@ -79,18 +79,28 @@ def runParserTest(self, innerHTML, input, expected, errors, treeClass):
#XXX - move this out into the setup function
#concatenate all consecutive character tokens into a single token
p = html5parser.HTMLParser(tree = treeClass)
-
+
+ if innerHTML:
+ innerHTML = str(innerHTML, "utf8")
+
+ if errors:
+ errors = str(errors, "utf8")
+ errors = errors.split("\n")
+
+ expected = str(expected, "utf8")
+
try:
if innerHTML:
- document = p.parseFragment(StringIO.StringIO(input), innerHTML)
+ document = p.parseFragment(io.BytesIO(input), innerHTML)
else:
try:
- document = p.parse(StringIO.StringIO(input))
+ document = p.parse(io.BytesIO(input))
except constants.DataLossWarning:
sys.stderr.write("Test input causes known dataloss, skipping")
return
except:
- errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
+ errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
+ "\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
self.assertTrue(False, errorMsg)
@@ -99,22 +109,23 @@ def runParserTest(self, innerHTML, input, expected, errors, treeClass):
expected = convertExpected(expected)
expected = attrlist.sub(sortattrs, expected)
- errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
+ errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
+ "\nExpected:", expected,
"\nReceived:", output])
self.assertEquals(expected, output, errorMsg)
errStr = ["Line: %i Col: %i %s %s"%(line, col,
constants.E[errorcode], datavars) for
((line,col), errorcode, datavars) in p.errors]
- errorMsg2 = "\n".join(["\n\nInput:", input,
+ errorMsg2 = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
"\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
if checkParseErrors:
self.assertEquals(len(p.errors), len(errors), errorMsg2)
def buildTestSuite():
- sys.stdout.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
+ sys.stdout.write('Testing tree builders '+ " ".join(list(treeTypes.keys())) + "\n")
- for treeName, treeCls in treeTypes.iteritems():
+ for treeName, treeCls in treeTypes.items():
files = html5lib_test_files('tree-construction')
files = [f for f in files if
not f.split(".")[-2][-2:] in ("s9", "10", "11", "12")] #skip namespace tests for now
@@ -122,14 +133,12 @@ def buildTestSuite():
testName = os.path.basename(filename).replace(".dat","")
tests = TestData(filename, "data")
-
for index, test in enumerate(tests):
input, errors, innerHTML, expected = [test[key] for key in
- 'data', 'errors',
+ ('data', 'errors',
'document-fragment',
- 'document']
- if errors:
- errors = errors.split("\n")
+ 'document')]
+
def testFunc(self, innerHTML=innerHTML, input=input,
expected=expected, errors=errors, treeCls=treeCls):
return self.runParserTest(innerHTML, input, expected, errors, treeCls)
@@ -150,12 +159,7 @@ def main():
unittest.main()
except SystemExit:
pass
-
- f = open("graph.dot", "w")
- f.write(str(g))
-
- print g.nodes.keys()
if __name__ == "__main__":
- print sys.argv
+ print(sys.argv)
main()
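
The parser tests move from StringIO.StringIO to the io module and have to pick the right class per payload: the raw test input stays bytes and is wrapped in io.BytesIO for the parser, while innerHTML, errors and the expected tree are decoded to str before comparison. The distinction in miniature:

    import io

    raw = b"<p>test</p>"                   # test input stays as bytes
    stream = io.BytesIO(raw)               # byte stream handed to the parser

    expected = str(b"| <p>", "utf8")       # expected output decoded to text
    print(type(stream.read()), type(expected))
    # <class 'bytes'> <class 'str'>
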
41 tests/test_tokenizer.py
@@ -1,10 +1,11 @@
import sys
import os
import unittest
-import cStringIO
+import io
import warnings
+import json as simplejson
-from support import simplejson, html5lib_test_files
+from support import html5lib_test_files
from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants
@@ -24,44 +25,44 @@ def parse(self, stream, encoding=None, innerHTML=False):
tokenizer.currentToken = {"type": "startTag",
"name":self._lastStartTag}
- types = dict((v,k) for k,v in constants.tokenTypes.iteritems())
+ types = dict((v,k) for k,v in constants.tokenTypes.items())
for token in tokenizer:
getattr(self, 'process%s' % types[token["type"]])(token)
return self.outputTokens
def processDoctype(self, token):
- self.outputTokens.append([u"DOCTYPE", token["name"], token["publicId"],
+ self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
token["systemId"], token["correct"]])
def processStartTag(self, token):
- self.outputTokens.append([u"StartTag", token["name"],
+ self.outputTokens.append(["StartTag", token["name"],
dict(token["data"][::-1]), token["selfClosing"]])
def processEmptyTag(self, token):
if token["name"] not in constants.voidElements:
- self.outputTokens.append(u"ParseError")
- self.outputTokens.append([u"StartTag", token["name"], dict(token["data"][::-1])])
+ self.outputTokens.append("ParseError")
+ self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])
def processEndTag(self, token):
- self.outputTokens.append([u"EndTag", token["name"],
+ self.outputTokens.append(["EndTag", token["name"],
token["selfClosing"]])
def processComment(self, token):
- self.outputTokens.append([u"Comment", token["data"]])
+ self.outputTokens.append(["Comment", token["data"]])
def processSpaceCharacters(self, token):
- self.outputTokens.append([u"Character", token["data"]])
+ self.outputTokens.append(["Character", token["data"]])
self.processSpaceCharacters = self.processCharacters
def processCharacters(self, token):
- self.outputTokens.append([u"Character", token["data"]])
+ self.outputTokens.append(["Character", token["data"]])
def processEOF(self, token):
pass
def processParseError(self, token):
- self.outputTokens.append([u"ParseError", token["data"]])
+ self.outputTokens.append(["ParseError", token["data"]])
def concatenateCharacterTokens(tokens):
outputTokens = []
@@ -79,7 +80,7 @@ def concatenateCharacterTokens(tokens):
def normalizeTokens(tokens):
# TODO: convert tests to reflect arrays
for i, token in enumerate(tokens):
- if token[0] == u'ParseError':
+ if token[0] == 'ParseError':
tokens[i] = token[0]
return simplejson.loads(simplejson.dumps(tokens))
@@ -106,7 +107,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder):
else:
#Sort the tokens into two groups; non-parse errors and parse errors
tokens = {"expected":[[],[]], "received":[[],[]]}
- for tokenType, tokenList in zip(tokens.keys(),
+ for tokenType, tokenList in zip(list(tokens.keys()),
(expectedTokens, receivedTokens)):
for token in tokenList:
if token != "ParseError":
@@ -124,7 +125,7 @@ def runTokenizerTest(self, test):
expected = concatenateCharacterTokens(test['output'])
if 'lastStartTag' not in test:
test['lastStartTag'] = None
- outBuffer = cStringIO.StringIO()
+ outBuffer = io.StringIO()
stdout = sys.stdout
sys.stdout = outBuffer
parser = TokenizerTestParser(test['contentModelFlag'],
@@ -132,11 +133,11 @@ def runTokenizerTest(self, test):
tokens = parser.parse(test['input'])
tokens = concatenateCharacterTokens(tokens)
received = normalizeTokens(tokens)
- errorMsg = u"\n".join(["\n\nContent Model Flag:",
+ errorMsg = "\n".join(["\n\nContent Model Flag:",
test['contentModelFlag'] ,
- "\nInput:", unicode(test['input']),
- "\nExpected:", unicode(expected),
- "\nreceived:", unicode(tokens)])
+ "\nInput:", str(test['input']),
+ "\nExpected:", str(expected),
+ "\nreceived:", str(tokens)])
errorMsg = errorMsg.encode("utf-8")
ignoreErrorOrder = test.get('ignoreErrorOrder', False)
self.assertEquals(tokensMatch(expected, received, ignoreErrorOrder),
@@ -144,7 +145,7 @@ def runTokenizerTest(self, test):
def buildTestSuite():
for filename in html5lib_test_files('tokenizer', '*.test'):
- tests = simplejson.load(file(filename))
+ tests = simplejson.load(open(filename))
testName = os.path.basename(filename).replace(".test","")
if 'tests' in tests:
for index,test in enumerate(tests['tests']):
57 tests/test_treewalkers.py
@@ -1,6 +1,6 @@
import os
import sys
-import StringIO
+import io
import unittest
import warnings
@@ -179,15 +179,15 @@ def convertTokens(tokens):
for token in concatenateCharacterTokens(tokens):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
- output.append(u"%s<%s>" % (" "*indent, token["name"]))
+ output.append("%s<%s>" % (" "*indent, token["name"]))
indent += 2
attrs = token["data"]
if attrs:
if hasattr(attrs, "items"):
- attrs = attrs.items()
+ attrs = list(attrs.items())
attrs.sort()
for name, value in attrs:
- output.append(u"%s%s=\"%s\"" % (" "*indent, name, value))
+ output.append("%s%s=\"%s\"" % (" "*indent, name, value))
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
@@ -199,8 +199,8 @@ def convertTokens(tokens):
if token["publicId"] or token["systemId"]:
output.append("""%s<!DOCTYPE %s "%s" "%s">"""%
(" "*indent, token["name"],
- token["publicId"],
- token["systemId"]))
+ token["publicId"] or "",
+ token["systemId"] or ""))
else:
output.append("%s<!DOCTYPE %s>"%(" "*indent,
token["name"]))
@@ -210,7 +210,7 @@ def convertTokens(tokens):
output.append("%s\"%s\"" % (" "*indent, token["data"]))
else:
pass # TODO: what to do with errors?
- return u"\n".join(output)
+ return "\n".join(output)
import re
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+",re.M)
@@ -221,12 +221,20 @@ def sortattrs(x):
class TestCase(unittest.TestCase):
def runTest(self, innerHTML, input, expected, errors, treeClass):
+
+ if innerHTML is not None:
+ innerHTML = str(innerHTML, "utf8")
+ expected = str(expected, "utf8")
+ if errors is not None:
+ errors = str(errors, "utf8")
+ errors = errors.split("\n")
+
p = html5parser.HTMLParser(tree = treeClass["builder"])
try:
if innerHTML:
- document = p.parseFragment(StringIO.StringIO(input), innerHTML)
+ document = p.parseFragment(io.BytesIO(input), innerHTML)
else:
- document = p.parse(StringIO.StringIO(input))
+ document = p.parse(io.BytesIO(input))
except constants.DataLossWarning:
#Ignore testcases we know we don't pass
return
@@ -237,7 +245,7 @@ def runTest(self, innerHTML, input, expected, errors, treeClass):
output = attrlist.sub(sortattrs, output)
expected = attrlist.sub(sortattrs, convertExpected(expected))
self.assertEquals(expected, output, "\n".join([
- "", "Input:", input,
+ "", "Input:", str(input, "utf8"),
"", "Expected:", expected,
"", "Received:", output
]))
@@ -247,19 +255,19 @@ def runTest(self, innerHTML, input, expected, errors, treeClass):
class TokenTestCase(unittest.TestCase):
def test_all_tokens(self):
expected = [
- {'data': [], 'type': 'StartTag', 'name': u'html'},
- {'data': [], 'type': 'StartTag', 'name': u'head'},
- {'data': [], 'type': 'EndTag', 'name': u'head'},
- {'data': [], 'type': 'StartTag', 'name': u'body'},
- {'data': u'a', 'type': 'Characters'},
- {'data': [], 'type': 'StartTag', 'name': u'div'},
- {'data': u'b', 'type': 'Characters'},
- {'data': [], 'type': 'EndTag', 'name': u'div'},
- {'data': u'c', 'type': 'Characters'},
- {'data': [], 'type': 'EndTag', 'name': u'body'},
- {'data': [], 'type': 'EndTag', 'name': u'html'}
+ {'data': [], 'type': 'StartTag', 'name': 'html'},
+ {'data': [], 'type': 'StartTag', 'name': 'head'},
+ {'data': [], 'type': 'EndTag', 'name': 'head'},
+ {'data': [], 'type': 'StartTag', 'name': 'body'},
+ {'data': 'a', 'type': 'Characters'},
+ {'data': [], 'type': 'StartTag', 'name': 'div'},
+ {'data': 'b', 'type': 'Characters'},
+ {'data': [], 'type': 'EndTag', 'name': 'div'},
+ {'data': 'c', 'type': 'Characters'},
+ {'data': [], 'type': 'EndTag', 'name': 'body'},
+ {'data': [], 'type': 'EndTag', 'name': 'html'}
]
- for treeName, treeCls in treeTypes.iteritems():
+ for treeName, treeCls in treeTypes.items():
p = html5parser.HTMLParser(tree = treeCls["builder"])
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
document = treeCls.get("adapter", lambda x: x)(document)
@@ -269,9 +277,9 @@ def test_all_tokens(self):
def buildTestSuite():
- sys.stdout.write('Testing tree walkers '+ " ".join(treeTypes.keys()) + "\n")
+ sys.stdout.write('Testing tree walkers '+ " ".join(list(treeTypes.keys())) + "\n")
- for treeName, treeCls in treeTypes.iteritems():
+ for treeName, treeCls in treeTypes.items():
files = html5lib_test_files('tree-construction')
files = [f for f in files if
not f.split(".")[-2][-2:] in ("s9", "10", "11", "12")] #skip namespace tests for now
@@ -286,7 +294,6 @@ def buildTestSuite():
innerHTML, expected) = [test[key] for key in ("data", "errors",
"document-fragment",
"document")]
- errors = errors.split("\n")
def testFunc(self, innerHTML=innerHTML, input=input,
expected=expected, errors=errors, treeCls=treeCls):
self.runTest(innerHTML, input, expected, errors, treeCls)