Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

resync with html5lib and feedparser

  • Loading branch information...
commit 77970dbaaa7edf25644726b4500633a1d0fd9de2 1 parent 1bcee5c
Sam Ruby authored
Showing with 1,254 additions and 638 deletions.
  1. +6 −2 planet/reconstitute.py
  2. +18 −7 planet/scrub.py
  3. +4 −1 planet/vendor/feedparser.py
  4. +4 −3 planet/vendor/html5lib/__init__.py
  5. +46 −7 planet/vendor/html5lib/constants.py
  6. +277 −276 planet/vendor/html5lib/html5parser.py
  7. +29 −22 planet/vendor/html5lib/ihatexml.py
  8. +54 −43 planet/vendor/html5lib/inputstream.py
  9. +3 −1 planet/vendor/html5lib/sanitizer.py
  10. +58 −10 planet/vendor/html5lib/serializer/htmlserializer.py
  11. +579 −164 planet/vendor/html5lib/tokenizer.py
  12. +16 −1 planet/vendor/html5lib/treebuilders/__init__.py
  13. +24 −26 planet/vendor/html5lib/treebuilders/_base.py
  14. +20 −26 planet/vendor/html5lib/treebuilders/dom.py
  15. +7 −4 planet/vendor/html5lib/treebuilders/etree.py
  16. +18 −14 planet/vendor/html5lib/treebuilders/etree_lxml.py
  17. +25 −9 planet/vendor/html5lib/treebuilders/simpletree.py
  18. +13 −6 planet/vendor/html5lib/treebuilders/soup.py
  19. +14 −4 planet/vendor/html5lib/treewalkers/_base.py
  20. +0 −1  planet/vendor/html5lib/treewalkers/dom.py
  21. +3 −3 planet/vendor/html5lib/treewalkers/genshistream.py
  22. +6 −0 planet/vendor/html5lib/treewalkers/lxmletree.py
  23. +5 −4 planet/vendor/html5lib/treewalkers/soup.py
  24. +20 −1 planet/vendor/html5lib/utils.py
  25. +2 −1  planet/vendor/httplib2/__init__.py
  26. +1 −1  tests/data/reconstitute/content_illegal_char.xml
  27. +2 −1  tests/test_reconstitute.py
8 planet/reconstitute.py
View
@@ -25,7 +25,7 @@
except:
from md5 import new as md5
-illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
+illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
@@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
+ value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
@@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c):
""" replace invalid characters """
- return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
+ return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value):
@@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
+ if illegal_xml_chars.search(data.data):
+ data = xdoc.createTextNode(
+ illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')
25 planet/scrub.py
View
@@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
- # Run this through HTML5's serializer
- from html5lib import html5parser, sanitizer, treebuilders
+ # Run this through HTML5's sanitizer
+ doc = None
+ if 'xhtml' in node['type']:
+ try:
+ from xml.dom import minidom
+ doc = minidom.parseString(node['value'])
+ except:
+ node['type']='text/html'
+
+ if not doc:
+ from html5lib import html5parser, treebuilders
+ p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+ doc = p.parseFragment(node['value'], encoding='utf-8')
+
from html5lib import treewalkers, serializer
- p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
- tree=treebuilders.getTreeBuilder('dom'))
- doc = p.parseFragment(node.value, encoding='utf-8')
+ from html5lib.filters import sanitizer
+ walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
- walker = treewalkers.getTreeWalker('dom')
- tree = xhtml.serialize(walker(doc), encoding='utf-8')
+ tree = xhtml.serialize(walker, encoding='utf-8')
+
node['value'] = ''.join([str(token) for token in tree])
5 planet/vendor/feedparser.py
View
@@ -1595,9 +1595,12 @@ def __init__(self, baseuri, baselang, encoding):
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
+ self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
+ if uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
@@ -1622,7 +1625,7 @@ def startElementNS(self, name, qname, attrs):
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
+ attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
7 planet/vendor/html5lib/__init__.py
View
@@ -8,9 +8,10 @@
import html5lib
f = open("my_document.html")
-p = html5lib.HTMLParser()
-tree = p.parse(f)
+tree = html5lib.parse(f)
"""
-from html5parser import HTMLParser, parse
+__version__ = "%(version)s"
+from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
+from treewalkers import getTreeWalker
from serializer import serialize
53 planet/vendor/html5lib/constants.py
View
@@ -180,6 +180,8 @@
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
+ "unexpected-form-in-table":
+ _(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
@@ -256,21 +258,18 @@
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
+ "eof-in-script-in-script":
+ _(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
+ "unexpected-end-tag-before-html":
+ _(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
-contentModelFlags = {
- "PCDATA":0,
- "RCDATA":1,
- "CDATA":2,
- "PLAINTEXT":3
-}
-
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
@@ -509,6 +508,8 @@
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
+xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
@@ -878,6 +879,44 @@
"zwnj;": u"\u200C"
}
+replacementCharacters = {
+ 0x0:u"\uFFFD",
+ 0x0d:u"\u000A",
+ 0x80:u"\u20AC",
+ 0x81:u"\u0081",
+ 0x81:u"\u0081",
+ 0x82:u"\u201A",
+ 0x83:u"\u0192",
+ 0x84:u"\u201E",
+ 0x85:u"\u2026",
+ 0x86:u"\u2020",
+ 0x87:u"\u2021",
+ 0x88:u"\u02C6",
+ 0x89:u"\u2030",
+ 0x8A:u"\u0160",
+ 0x8B:u"\u2039",
+ 0x8C:u"\u0152",
+ 0x8D:u"\u008D",
+ 0x8E:u"\u017D",
+ 0x8F:u"\u008F",
+ 0x90:u"\u0090",
+ 0x91:u"\u2018",
+ 0x92:u"\u2019",
+ 0x93:u"\u201C",
+ 0x94:u"\u201D",
+ 0x95:u"\u2022",
+ 0x96:u"\u2013",
+ 0x97:u"\u2014",
+ 0x98:u"\u02DC",
+ 0x99:u"\u2122",
+ 0x9A:u"\u0161",
+ 0x9B:u"\u203A",
+ 0x9C:u"\u0153",
+ 0x9D:u"\u009D",
+ 0x9E:u"\u017E",
+ 0x9F:u"\u0178",
+}
+
encodings = {
'437': 'cp437',
'850': 'cp850',
553 planet/vendor/html5lib/html5parser.py
View
@@ -4,6 +4,17 @@
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
+
+try:
+ any
+except:
+ # Implement 'any' for python 2.4 and previous
+ def any(iterable):
+ for element in iterable:
+ if element:
+ return True
+ return False
+
import sys
import inputstream
@@ -14,7 +25,7 @@
from treebuilders import simpletree
import utils
-from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
+from constants import spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
@@ -26,6 +37,12 @@ def parse(doc, treebuilder="simpletree", encoding=None,
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding)
+def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
+ namespaceHTMLElements=True):
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parseFragment(doc, container=container, encoding=encoding)
+
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
@@ -60,7 +77,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
"afterHead": AfterHeadPhase(self, self.tree),
"inBody": InBodyPhase(self, self.tree),
- "inCDataRCData": InCDataRCDataPhase(self, self.tree),
+ "text": TextPhase(self, self.tree),
"inTable": InTablePhase(self, self.tree),
"inTableText": InTableTextPhase(self, self.tree),
"inCaption": InCaptionPhase(self, self.tree),
@@ -107,14 +124,14 @@ def reset(self):
self.innerHTML = self.container.lower()
if self.innerHTML in cdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+ self.tokenizer.state = self.tokenizer.rcdataState
elif self.innerHTML in rcdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+ self.tokenizer.state = self.tokenizer.rawtextState
elif self.innerHTML == 'plaintext':
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+ self.tokenizer.state = self.tokenizer.plaintextState
else:
- # contentModelFlag already is PCDATA
- #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
+ # state already is data state
+ # self.tokenizer.state = self.tokenizer.dataState
pass
self.phase = self.phases["beforeHtml"]
self.phase.insertHtmlElement()
@@ -152,8 +169,6 @@ def mainLoop(self):
for token in self.normalizedTokens():
- #print self.phase.__class__.__name__
- #print token
type = token["type"]
if type == CharactersToken:
self.phase.processCharacters(token)
@@ -376,18 +391,22 @@ def resetInsertionMode(self):
self.phase = self.phases["inBody"]
break
- def parseRCDataCData(self, token, contentType):
- """Generic (R)CDATA Parsing algorithm
- contentType - RCDATA or CDATA
+ def parseRCDataRawtext(self, token, contentType):
+ """Generic RCDATA/RAWTEXT Parsing algorithm
+ contentType - RCDATA or RAWTEXT
"""
- assert contentType in ("CDATA", "RCDATA")
+ assert contentType in ("RAWTEXT", "RCDATA")
element = self.tree.insertElement(token)
- self.tokenizer.contentModelFlag = contentModelFlags[contentType]
+
+ if contentType == "RAWTEXT":
+ self.tokenizer.state = self.tokenizer.rawtextState
+ else:
+ self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
- self.phase = self.phases["inCDataRCData"]
+ self.phase = self.phases["text"]
class Phase(object):
"""Base class for helper object that implements each phase of processing
@@ -441,34 +460,24 @@ def processEndTag(self, token):
self.endTagHandler[token["name"]](token)
class InitialPhase(Phase):
- # This phase deals with error handling as well which is currently not
- # covered in the specification. The error handling is typically known as
- # "quirks mode". It is expected that a future version of HTML5 will define
- # this.
- def processEOF(self):
- self.parser.parseError("expected-doctype-but-got-eof")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
- self.parser.phase.processEOF()
-
+ def processSpaceCharacters(self, token):
+ pass
+
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
-
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId != None or
- systemId != None):
+ systemId != None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
- if systemId is None:
- systemId = ""
self.tree.insertDoctype(token)
@@ -476,117 +485,108 @@ def processDoctype(self, token):
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html"
- or publicId in
- ("+//silmaril//dtd html pro v0r11 19970101//en",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
- "-//as//dtd html 3.0 aswedit + extensions//en",
- "-//ietf//dtd html 2.0 level 1//en",
- "-//ietf//dtd html 2.0 level 2//en",
- "-//ietf//dtd html 2.0 strict level 1//en",
- "-//ietf//dtd html 2.0 strict level 2//en",
- "-//ietf//dtd html 2.0 strict//en",
- "-//ietf//dtd html 2.0//en",
- "-//ietf//dtd html 2.1e//en",
- "-//ietf//dtd html 3.0//en",
- "-//ietf//dtd html 3.0//en//",
- "-//ietf//dtd html 3.2 final//en",
- "-//ietf//dtd html 3.2//en",
- "-//ietf//dtd html 3//en",
- "-//ietf//dtd html level 0//en",
- "-//ietf//dtd html level 0//en//2.0",
- "-//ietf//dtd html level 1//en",
- "-//ietf//dtd html level 1//en//2.0",
- "-//ietf//dtd html level 2//en",
- "-//ietf//dtd html level 2//en//2.0",
- "-//ietf//dtd html level 3//en",
- "-//ietf//dtd html level 3//en//3.0",
- "-//ietf//dtd html strict level 0//en",
- "-//ietf//dtd html strict level 0//en//2.0",
- "-//ietf//dtd html strict level 1//en",
- "-//ietf//dtd html strict level 1//en//2.0",
- "-//ietf//dtd html strict level 2//en",
- "-//ietf//dtd html strict level 2//en//2.0",
- "-//ietf//dtd html strict level 3//en",
- "-//ietf//dtd html strict level 3//en//3.0",
- "-//ietf//dtd html strict//en",
- "-//ietf//dtd html strict//en//2.0",
- "-//ietf//dtd html strict//en//3.0",
- "-//ietf//dtd html//en",
- "-//ietf//dtd html//en//2.0",
- "-//ietf//dtd html//en//3.0",
- "-//metrius//dtd metrius presentational//en",
- "-//microsoft//dtd internet explorer 2.0 html strict//en",
- "-//microsoft//dtd internet explorer 2.0 html//en",
- "-//microsoft//dtd internet explorer 2.0 tables//en",
- "-//microsoft//dtd internet explorer 3.0 html strict//en",
- "-//microsoft//dtd internet explorer 3.0 html//en",
- "-//microsoft//dtd internet explorer 3.0 tables//en",
- "-//netscape comm. corp.//dtd html//en",
- "-//netscape comm. corp.//dtd strict html//en",
- "-//o'reilly and associates//dtd html 2.0//en",
- "-//o'reilly and associates//dtd html extended 1.0//en",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//en",
- "-//spyglass//dtd html 2.0 extended//en",
- "-//sq//dtd html 2.0 hotmetal + extensions//en",
- "-//sun microsystems corp.//dtd hotjava html//en",
- "-//sun microsystems corp.//dtd hotjava strict html//en",
- "-//w3c//dtd html 3 1995-03-24//en",
- "-//w3c//dtd html 3.2 draft//en",
- "-//w3c//dtd html 3.2 final//en",
- "-//w3c//dtd html 3.2//en",
- "-//w3c//dtd html 3.2s draft//en",
- "-//w3c//dtd html 4.0 frameset//en",
- "-//w3c//dtd html 4.0 transitional//en",
- "-//w3c//dtd html experimental 19960712//en",
- "-//w3c//dtd html experimental 970421//en",
- "-//w3c//dtd w3 html//en",
- "-//w3o//dtd w3 html 3.0//en",
- "-//w3o//dtd w3 html 3.0//en//",
- "-//w3o//dtd w3 html strict 3.0//en//",
- "-//webtechs//dtd mozilla html 2.0//en",
- "-//webtechs//dtd mozilla html//en",
- "-/w3c/dtd html 4.0 transitional/en",
- "html")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)
- or (systemId != None and
- systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+ or publicId.startswith(
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//"))
+ or publicId in
+ ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")
+ or publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId == None
+ or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
- elif (publicId in
- ("-//w3c//dtd xhtml 1.0 frameset//EN",
- "-//w3c//dtd xhtml 1.0 transitional//EN")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)):
+ elif (publicId.startswith(
+ ("-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//"))
+ or publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId != None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
-
- def processSpaceCharacters(self, token):
- pass
+
+ def anythingElse(self):
+ self.parser.compatMode = "quirks"
+ self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processCharacters(token)
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processEndTag(token)
+
+ def processEOF(self):
+ self.parser.parseError("expected-doctype-but-got-eof")
+ self.anythingElse()
+ self.parser.phase.processEOF()
class BeforeHtmlPhase(Phase):
@@ -617,8 +617,12 @@ def processStartTag(self, token):
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
- self.insertHtmlElement()
- self.parser.phase.processEndTag(token)
+ if token["name"] not in ("head", "body", "html", "br"):
+ self.parser.parseError("unexpected-end-tag-before-html",
+ {"name": token["name"]})
+ else:
+ self.insertHtmlElement()
+ self.parser.phase.processEndTag(token)
class BeforeHeadPhase(Phase):
@@ -632,7 +636,7 @@ def __init__(self, parser, tree):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
- (("head", "br"), self.endTagImplyHead)
+ (("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -647,6 +651,9 @@ def processCharacters(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
self.parser.phase.processCharacters(token)
+ def startTagHtml(self, token):
+ self.parser.phases["inBody"].processStartTag(token)
+
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
@@ -673,8 +680,8 @@ def __init__(self, parser, tree):
("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
("script", self.startTagScript),
- (("base", "link", "command", "eventsource"),
- self.startTagBaseLinkCommandEventsource),
+ (("base", "link", "command"),
+ self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
@@ -709,7 +716,7 @@ def startTagHtml(self, token):
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
- def startTagBaseLinkCommandEventsource(self, token):
+ def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
@@ -724,23 +731,27 @@ def startTagMeta(self, token):
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif "content" in attributes:
- data = inputstream.EncodingBytes(
- attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
+ # Encoding it as UTF-8 here is a hack, as really we should pass
+ # the abstract Unicode string, and just use the
+ # ContentAttrParser on that, but using UTF-8 allows all chars
+ # to be encoded and as an ASCII-superset works.
+ data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
- self.parser.parseRCDataCData(token, "RCDATA")
+ self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token):
#Need to decide whether to implement the scripting-disabled case
- self.parser.parseRCDataCData(token, "CDATA")
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagScript(self, token):
- #I think this is equivalent to the CDATA stuff since we don't execute script
- #self.tree.insertElement(token)
- self.parser.parseRCDataCData(token, "CDATA")
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+ self.parser.originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
@@ -819,7 +830,6 @@ def startTagOther(self, token):
self.parser.phase.processStartTag(token)
def endTagHtmlBodyBr(self, token):
- #This is not currently in the spec
self.anythingElse()
self.parser.phase.processEndTag(token)
@@ -833,8 +843,8 @@ def anythingElse(self):
class InBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-body
- # the crazy mode
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+ # the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
@@ -843,15 +853,16 @@ def __init__(self, parser, tree):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
- (("base", "link", "meta", "script", "style", "title"),
- self.startTagProcessInHead),
+ (("base", "command", "link", "meta", "noframes", "script", "style",
+ "title"), self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "datagrid",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
- "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "listing",
- "menu", "nav", "ol", "p", "pre", "section", "ul"),
+ "details", "dir", "div", "dl", "fieldset", "figure",
+ "footer", "header", "hgroup", "menu", "nav", "ol", "p",
+ "section", "ul"),
self.startTagCloseP),
+ (("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext",self.startTagPlaintext),
@@ -865,13 +876,14 @@ def __init__(self, parser, tree):
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "basefont", "bgsound", "br", "embed", "img", "input",
- "keygen", "param", "spacer", "wbr"), self.startTagVoidFormatting),
+ "keygen", "spacer", "wbr"), self.startTagVoidFormatting),
+ (("param", "source"), self.startTagParamSource),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
- (("noembed", "noframes", "noscript"), self.startTagCdata),
+ (("noembed", "noframes", "noscript"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
@@ -879,8 +891,7 @@ def __init__(self, parser, tree):
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
- "tr"), self.startTagMisplaced),
- (("event-source", "command"), self.startTagNew)
+ "tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
@@ -888,9 +899,9 @@ def __init__(self, parser, tree):
("body",self.endTagBody),
("html",self.endTagHtml),
(("address", "article", "aside", "blockquote", "center", "datagrid",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
- "footer", "header", "listing", "menu", "nav", "ol", "pre", "section",
- "ul"), self.endTagBlock),
+ "details", "dir", "div", "dl", "fieldset", "figure",
+ "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre",
+ "section", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p",self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
@@ -933,14 +944,10 @@ def processSpaceCharactersDropNewline(self, token):
self.tree.insertText(data)
def processCharacters(self, token):
- # XXX The specification says to do this for every character at the
- # moment, but apparently that doesn't match the real world so we don't
- # do it for space characters.
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
- self.framesetOK = False
+ self.parser.framesetOK = False
- #This matches the current spec but may not match the real world
def processSpaceCharacters(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
@@ -976,9 +983,13 @@ def startTagCloseP(self, token):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
- if token["name"] in ("pre", "listing"):
- self.parser.framesetOK = False
- self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+
+ def startTagPreListing(self, token):
+ if self.tree.elementInScope("p"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+ self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, token):
if self.tree.formPointer:
@@ -991,39 +1002,31 @@ def startTagForm(self, token):
def startTagListItem(self, token):
self.parser.framesetOK = False
- if self.tree.elementInScope("p"):
- self.endTagP(impliedTagToken("p"))
- stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")}
- stopName = stopNames[token["name"]]
- # AT Use reversed in Python 2.4...
- for i, node in enumerate(self.tree.openElements[::-1]):
- if node.name in stopName:
- poppedNodes = []
- for j in range(i+1):
- poppedNodes.append(self.tree.openElements.pop())
- if i >= 1:
- self.parser.parseError(
- i == 1 and "missing-end-tag" or "missing-end-tags",
- {"name": u", ".join([item.name
- for item
- in poppedNodes[:-1]])})
- break
-
- # Phrasing elements are all non special, non scoping, non
- # formatting elements
- if (node.nameTuple in
- (specialElements | scopingElements)
- and node.name not in ("address", "div")):
+ stopNamesMap = {"li":["li"],
+ "dt":["dt", "dd"],
+ "dd":["dt", "dd"]}
+ stopNames = stopNamesMap[token["name"]]
+ for node in reversed(self.tree.openElements):
+ if node.name in stopNames:
+ self.parser.phase.processEndTag(
+ impliedTagToken(node.name, "EndTag"))
break
- # Always insert an <li> element.
+ if (node.nameTuple in (scopingElements | specialElements) and
+ node.name not in ("address", "div", "p")):
+ break
+
+ if self.tree.elementInScope("p"):
+ self.parser.phase.processEndTag(
+ impliedTagToken("p", "EndTag"))
+
self.tree.insertElement(token)
def startTagPlaintext(self, token):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
- self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
+ self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
def startTagHeading(self, token):
if self.tree.elementInScope("p"):
@@ -1031,15 +1034,6 @@ def startTagHeading(self, token):
if self.tree.openElements[-1].name in headingElements:
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
self.tree.openElements.pop()
- # Uncomment the following for IE7 behavior:
- #
- #for item in headingElements:
- # if self.tree.elementInScope(item):
- # self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
- # item = self.tree.openElements.pop()
- # while item.name not in headingElements:
- # item = self.tree.openElements.pop()
- # break
self.tree.insertElement(token)
def startTagA(self, token):
@@ -1088,9 +1082,11 @@ def startTagAppletMarqueeObject(self, token):
self.parser.framesetOK = False
def startTagXmp(self, token):
+ if self.tree.elementInScope("p"):
+ self.endTagP(impliedTagToken("p"))
self.tree.reconstructActiveFormattingElements()
- self.parser.parseRCDataCData(token, "CDATA")
self.parser.framesetOK = False
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagTable(self, token):
if self.parser.compatMode != "quirks":
@@ -1107,6 +1103,11 @@ def startTagVoidFormatting(self, token):
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
+ def startTagParamSource(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
def startTagHr(self, token):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
@@ -1156,19 +1157,18 @@ def startTagIsIndex(self, token):
self.processEndTag(impliedTagToken("form"))
def startTagTextarea(self, token):
- # XXX Form element pointer checking here as well...
self.tree.insertElement(token)
- self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
+ self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
self.parser.framesetOK = False
def startTagIFrame(self, token):
self.parser.framesetOK = False
- self.startTagCdata(token)
+ self.startTagRawtext(token)
- def startTagCdata(self, token):
+ def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
- self.parser.parseRCDataCData(token, "CDATA")
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagOpt(self, token):
if self.tree.elementInScope("option"):
@@ -1238,46 +1238,34 @@ def startTagMisplaced(self, token):
"""
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
- def startTagNew(self, token):
- """New HTML5 elements, "event-source", "section", "nav",
- "article", "aside", "header", "footer", "datagrid", "command"
- """
- #2007-08-30 - MAP - commenting out this write to sys.stderr because
- # it's really annoying me when I run the validator tests
- #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
- self.startTagOther(token)
- #raise NotImplementedError
-
def startTagOther(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
def endTagP(self, token):
- if self.tree.elementInScope("p"):
- self.tree.generateImpliedEndTags("p")
- if self.tree.openElements[-1].name != "p":
+ if not self.tree.elementInScope("p"):
+ self.startTagCloseP(impliedTagToken("p", "StartTag"))
self.parser.parseError("unexpected-end-tag", {"name": "p"})
- if self.tree.elementInScope("p"):
- while self.tree.elementInScope("p"):
- self.tree.openElements.pop()
+ self.endTagP(impliedTagToken("p", "EndTag"))
else:
- self.startTagCloseP(impliedTagToken("p", "StartTag"))
- self.endTagP(impliedTagToken("p"))
+ self.tree.generateImpliedEndTags("p")
+ if self.tree.openElements[-1].name != "p":
+ self.parser.parseError("unexpected-end-tag", {"name": "p"})
+ node = self.tree.openElements.pop()
+ while node.name != "p":
+ node = self.tree.openElements.pop()
def endTagBody(self, token):
- # XXX Need to take open <p> tags into account here. We shouldn't imply
- # </p> but we should not throw a parse error either. Specification is
- # likely to be updated.
- if (len(self.tree.openElements) == 1 or
- self.tree.openElements[1].name != "body"):
- # innerHTML case
+ if not self.tree.elementInScope("body"):
self.parser.parseError()
return
elif self.tree.openElements[-1].name != "body":
for node in self.tree.openElements[2:]:
- if node.name not in frozenset(("dd", "dt", "li", "p",
+ if node.name not in frozenset(("dd", "dt", "li", "optgroup",
+ "option", "p", "rp", "rt",
"tbody", "td", "tfoot",
- "th", "thead", "tr")):
+ "th", "thead", "tr", "body",
+ "html")):
#Not sure this is the correct name for the parse error
self.parser.parseError(
"expected-one-end-tag-but-got-another",
@@ -1286,8 +1274,9 @@ def endTagBody(self, token):
self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, token):
- self.endTagBody(impliedTagToken("body"))
- if not self.parser.innerHTML:
+ #We repeat the test for the body end tag token being ignored here
+ if self.tree.elementInScope("body"):
+ self.endTagBody(impliedTagToken("body"))
self.parser.phase.processEndTag(token)
def endTagBlock(self, token):
@@ -1307,7 +1296,7 @@ def endTagBlock(self, token):
def endTagForm(self, token):
node = self.tree.formPointer
self.tree.formPointer = None
- if node is None or not self.tree.elementInScope(token["name"]):
+ if node is None or not self.tree.elementInScope(node.name):
self.parser.parseError("unexpected-end-tag",
{"name":"form"})
else:
@@ -1315,17 +1304,21 @@ def endTagForm(self, token):
if self.tree.openElements[-1].name != node:
self.parser.parseError("end-tag-too-early-ignored",
{"name": "form"})
- self.tree.openElements.remove(node)
+ self.tree.openElements.remove(node)
def endTagListItem(self, token):
- # AT Could merge this with the Block case
- if self.tree.elementInScope(token["name"]):
- self.tree.generateImpliedEndTags(token["name"])
-
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("end-tag-too-early", {"name": token["name"]})
-
- if self.tree.elementInScope(token["name"]):
+ if token["name"] == "li":
+ variant = "list"
+ else:
+ variant = None
+ if not self.tree.elementInScope(token["name"], variant=variant):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+ else:
+ self.tree.generateImpliedEndTags(exclude = token["name"])
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError(
+ "end-tag-too-early",
+ {"name": token["name"]})
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
@@ -1352,26 +1345,28 @@ def endTagFormatting(self, token):
name = token["name"]
while True:
# Step 1 paragraph 1
- afeElement = self.tree.elementInActiveFormattingElements(
+ formattingElement = self.tree.elementInActiveFormattingElements(
token["name"])
- if not afeElement or (afeElement in self.tree.openElements and
- not self.tree.elementInScope(afeElement.name)):
+ if not formattingElement or (formattingElement in
+ self.tree.openElements and
+ not self.tree.elementInScope(
+ formattingElement.name)):
self.parser.parseError("adoption-agency-1.1", {"name": token["name"]})
return
# Step 1 paragraph 2
- elif afeElement not in self.tree.openElements:
+ elif formattingElement not in self.tree.openElements:
self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
- self.tree.activeFormattingElements.remove(afeElement)
+ self.tree.activeFormattingElements.remove(formattingElement)
return
# Step 1 paragraph 3
- if afeElement != self.tree.openElements[-1]:
+ if formattingElement != self.tree.openElements[-1]:
self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
# Step 2
# Start of the adoption agency algorithm proper
- afeIndex = self.tree.openElements.index(afeElement)
+ afeIndex = self.tree.openElements.index(formattingElement)
furthestBlock = None
for element in self.tree.openElements[afeIndex:]:
if (element.nameTuple in
@@ -1382,7 +1377,7 @@ def endTagFormatting(self, token):
# Step 3
if furthestBlock is None:
element = self.tree.openElements.pop()
- while element != afeElement:
+ while element != formattingElement:
element = self.tree.openElements.pop()
self.tree.activeFormattingElements.remove(element)
return
@@ -1397,7 +1392,7 @@ def endTagFormatting(self, token):
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
- bookmark = self.tree.activeFormattingElements.index(afeElement)
+ bookmark = self.tree.activeFormattingElements.index(formattingElement)
# Step 6
lastNode = node = furthestBlock
@@ -1412,7 +1407,7 @@ def endTagFormatting(self, token):
self.tree.openElements.index(node)-1]
self.tree.openElements.remove(tmpNode)
# Step 6.3
- if node == afeElement:
+ if node == formattingElement:
break
# Step 6.4
if lastNode == furthestBlock:
@@ -1429,7 +1424,7 @@ def endTagFormatting(self, token):
self.tree.openElements.index(node)] = clone
node = clone
- # Step 7.6
+ # Step 6.6
# Remove lastNode from its parents, if any
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
@@ -1447,7 +1442,7 @@ def endTagFormatting(self, token):
commonAncestor.appendChild(lastNode)
# Step 8
- clone = afeElement.cloneNode()
+ clone = formattingElement.cloneNode()
# Step 9
furthestBlock.reparentChildren(clone)
@@ -1456,11 +1451,11 @@ def endTagFormatting(self, token):
furthestBlock.appendChild(clone)
# Step 11
- self.tree.activeFormattingElements.remove(afeElement)
+ self.tree.activeFormattingElements.remove(formattingElement)
self.tree.activeFormattingElements.insert(bookmark, clone)
# Step 12
- self.tree.openElements.remove(afeElement)
+ self.tree.openElements.remove(formattingElement)
self.tree.openElements.insert(
self.tree.openElements.index(furthestBlock) + 1, clone)
@@ -1498,7 +1493,7 @@ def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break
-class InCDataRCDataPhase(Phase):
+class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([])
@@ -1518,7 +1513,7 @@ def processEOF(self):
self.parser.phase.processEOF()
def startTagOther(self, token):
- assert False, "Tried to process start tag %s in (R)CDATA mode"%name
+ assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode"%name
def endTagScript(self, token):
node = self.tree.openElements.pop()
@@ -1544,7 +1539,8 @@ def __init__(self, parser, tree):
(("td", "th", "tr"), self.startTagImplyTbody),
("table", self.startTagTable),
(("style", "script"), self.startTagStyleScript),
- ("input", self.startTagInput)
+ ("input", self.startTagInput),
+ ("form", self.startTagForm)
])
self.startTagHandler.default = self.startTagOther
@@ -1638,6 +1634,11 @@ def startTagInput(self, token):
else:
self.startTagOther(token)
+ def startTagForm(self, token):
+ self.parser.parseError("unexpected-form-in-table")
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
if "tainted" not in self.getCurrentTable()._flags:
@@ -1648,7 +1649,7 @@ def startTagOther(self, token):
self.tree.insertFromTable = False
def endTagTable(self, token):
- if self.tree.elementInScope("table", True):
+ if self.tree.elementInScope("table", variant="table"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError("end-tag-too-early-named",
@@ -1695,10 +1696,10 @@ def processComment(self, token):
self.phase = self.originalPhase
self.phase.processComment(token)
- def processEOF(self, token):
+ def processEOF(self):
self.flushCharacters()
self.phase = self.originalPhase
- self.phase.processEOF(token)
+ self.phase.processEOF()
def processCharacters(self, token):
self.characterTokens.append(token)
@@ -1740,7 +1741,7 @@ def __init__(self, parser, tree):
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self):
- return not self.tree.elementInScope("caption", True)
+ return not self.tree.elementInScope("caption", variant="table")
def processEOF(self):
self.parser.phases["inBody"].processEOF()
@@ -1911,9 +1912,9 @@ def startTagTableCell(self, token):
def startTagTableOther(self, token):
# XXX AT Any ideas on how to share this with endTagTable?
- if (self.tree.elementInScope("tbody", True) or
- self.tree.elementInScope("thead", True) or
- self.tree.elementInScope("tfoot", True)):
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
@@ -1926,7 +1927,7 @@ def startTagOther(self, token):
self.parser.phases["inTable"].processStartTag(token)
def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.clearStackToTableBodyContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
@@ -1935,9 +1936,9 @@ def endTagTableRowGroup(self, token):
{"name": token["name"]})
def endTagTable(self, token):
- if (self.tree.elementInScope("tbody", True) or
- self.tree.elementInScope("thead", True) or
- self.tree.elementInScope("tfoot", True)):
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
@@ -1983,7 +1984,7 @@ def clearStackToTableRowContext(self):
self.tree.openElements.pop()
def ignoreEndTagTr(self):
- return not self.tree.elementInScope("tr", tableVariant=True)
+ return not self.tree.elementInScope("tr", variant="table")
# the rest
def processEOF(self):
@@ -2030,7 +2031,7 @@ def endTagTable(self, token):
self.parser.phase.processEndTag(token)
def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagTr("tr")
self.parser.phase.processEndTag(token)
else:
@@ -2064,9 +2065,9 @@ def __init__(self, parser, tree):
# helper
def closeCell(self):
- if self.tree.elementInScope("td", True):
+ if self.tree.elementInScope("td", variant="table"):
self.endTagTableCell(impliedTagToken("td"))
- elif self.tree.elementInScope("th", True):
+ elif self.tree.elementInScope("th", variant="table"):
self.endTagTableCell(impliedTagToken("th"))
# the rest
@@ -2077,8 +2078,8 @@ def processCharacters(self, token):
self.parser.phases["inBody"].processCharacters(token)
def startTagTableOther(self, token):
- if (self.tree.elementInScope("td", True) or
- self.tree.elementInScope("th", True)):
+ if (self.tree.elementInScope("td", variant="table") or
+ self.tree.elementInScope("th", variant="table")):
self.closeCell()
self.parser.phase.processStartTag(token)
else:
@@ -2093,7 +2094,7 @@ def startTagOther(self, token):
self.parser.phases["inBody"].processStartTag
def endTagTableCell(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.tree.generateImpliedEndTags(token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-cell-end-tag",
@@ -2113,7 +2114,7 @@ def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagImply(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.closeCell()
self.parser.phase.processEndTag(token)
else:
@@ -2178,7 +2179,7 @@ def startTagSelect(self, token):
def startTagInput(self, token):
self.parser.parseError("unexpected-input-in-select")
- if self.tree.elementInScope("select", True):
+ if self.tree.elementInScope("select", variant="table"):
self.endTagSelect("select")
self.parser.phase.processStartTag(token)
@@ -2207,7 +2208,7 @@ def endTagOptgroup(self, token):
{"name": "optgroup"})
def endTagSelect(self, token):
- if self.tree.elementInScope("select", True):
+ if self.tree.elementInScope("select", variant="table"):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
@@ -2219,7 +2220,7 @@ def endTagSelect(self, token):
def endTagTableElements(self, token):
self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]})
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagSelect("select")
self.parser.phase.processEndTag(token)
@@ -2260,7 +2261,7 @@ def startTagOther(self, token):
def endTagTable(self, token):
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
- if self.tree.elementInScope(token["name"], tableVariant=True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagOther(impliedTagToken("select"))
self.parser.phase.processEndTag(token)
51 planet/vendor/html5lib/ihatexml.py
View
@@ -72,44 +72,38 @@ def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
- rv.append(intToUnicodeStr(item[0]))
+ rv.append(escapeRegexp(unichr(item[0])))
else:
- rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
- return "[%s]"%"|".join(rv)
+ rv.append(escapeRegexp(unichr(item[0])) + "-" +
+ escapeRegexp(unichr(item[1])))
+ return "[%s]"%"".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
-def intToUnicodeStr(intValue):
- #There must be a better (non-evil) way to do this
- return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
-
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
- string = string.replace(char, r"\\" + char)
+ string = string.replace(char, "\\" + char)
if char in string:
print string
return string
#output from the above
-nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e8
3|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\
u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+
+nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\
u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
- def __init__(self, replaceChars = None,
- replaceRanges = None,
+ def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
- if replaceRanges is not None or replaceChars is not None:
- raise NotImplementedError
- else:
- self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@@ -147,14 +141,27 @@ def coerceCharacters(self, data):
return data
def toXmlName(self, name):
- replaceChars = set(self.replaceCharsRegexp.findall(name))
+ nameFirst = name[0]
+ nameRest = name[1:]
+ m = nonXmlNameFirstBMPRegexp.match(nameFirst)
+ if m:
+ nameFirstOutput = self.getReplacementCharacter(nameFirst)
+ else:
+ nameFirstOutput = nameFirst
+
+ nameRestOutput = nameRest
+ replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
- if char in self.replaceCache:
- replacement = self.replaceCache[char]
- else:
- replacement = self.escapeChar(char)
- name = name.replace(char, replacement)
- return name
+ replacement = self.getReplacementCharacter(char)
+ nameRestOutput = nameRestOutput.replace(char, replacement)
+ return nameFirstOutput + nameRestOutput
+
+ def getReplacementCharacter(self, char):
+ if char in self.replaceCache:
+ replacement = self.replaceCache[char]
+ else:
+ replacement = self.escapeChar(char)
+ return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
97 planet/vendor/html5lib/inputstream.py
View
@@ -5,6 +5,7 @@
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings, ReparseException
+import utils
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
@@ -158,7 +159,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
-
self.reset()
def reset(self):
@@ -382,14 +382,9 @@ def characterErrorsUCS2(self, data):
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
- if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
- pos < len(data) - 1 and
- ord(data[pos + 1]) >= 0xDC00 and
- ord(data[pos + 1]) <= 0xDFFF):
+ if utils.isSurrogatePair(data[pos:pos+2]):
#We have a surrogate pair!
- #From a perl manpage
- char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
- (ord(data[pos + 1]) - 0xDC00))
+ char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
@@ -449,6 +444,20 @@ def charsUntil(self, characters, opposite = False):
r = u"".join(rv)
return r
+ def charsUntilEOF(self):
+ """ Returns a string of characters from the stream up to EOF."""
+
+ rv = []
+
+ while True:
+ rv.append(self.chunk[self.chunkOffset:])
+ if not self.readChunk():
+ # Reached EOF
+ break
+
+ r = u"".join(rv)
+ return r
+
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
@@ -471,7 +480,7 @@ class EncodingBytes(str):
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
- return str.__new__(self, value)
+ return str.__new__(self, value.lower())
def __init__(self, value):
self._position=-1
@@ -539,14 +548,12 @@ def skipUntil(self, chars):
self._position = p
return None
- def matchBytes(self, bytes, lower=False):
+ def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p+len(bytes)]
- if lower:
- data = data.lower()
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
@@ -557,6 +564,9 @@ def jumpTo(self, bytes):
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
+ # XXX: This is ugly, but I can't see a nicer way to fix this.
+ if self._position == -1:
+ self._position = 0
self._position += (newPosition + len(bytes)-1)
return True
else:
@@ -581,7 +591,7 @@ def getEncoding(self):
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
- if self.data.matchBytes(key, lower=True):
+ if self.data.matchBytes(key):
try:
keepParsing = method()
break
@@ -659,59 +669,59 @@ def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
+ # Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset("/"))
- if c == "<":
- data.previous()
- return None
- elif c == ">" or c is None:
+ # Step 2
+ if c in (">", None):
return None
+ # Step 3
attrName = []
attrValue = []
- spaceFound = False
- #Step 5 attribute name
+ #Step 4 attribute name
while True:
if c == "=" and attrName:
break
elif c in spaceCharactersBytes:
- spaceFound=True
+ #Step 6!
+ c = data.skip()
+ c = data.next()
break
- elif c in ("/", "<", ">"):
+ elif c in ("/", ">"):
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
+ elif c == None:
+ return None
else:
attrName.append(c)
- #Step 6
+ #Step 5
c = data.next()
#Step 7
- if spaceFound:
- c = data.skip()
- #Step 8
- if c != "=":
- data.previous()
- return "".join(attrName), ""
- #XXX need to advance position in both spaces and value case
- #Step 9
+ if c != "=":
+ data.previous()
+ return "".join(attrName), ""
+ #Step 8
data.next()
- #Step 10
+ #Step 9
c = data.skip()
- #Step 11
+ #Step 10
if c in ("'", '"'):
- #11.1
+ #10.1
quoteChar = c
while True:
- #11.3
+ #10.2
c = data.next()
+ #10.3
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
- #11.4
+ #10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
- #11.5
+ #10.5
else:
attrValue.append(c)
- elif c in (">", "<"):
+ elif c == ">":
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
@@ -719,12 +729,15 @@ def getAttribute(self):
return None
else:
attrValue.append(c)
+ # Step 11
while True:
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
+ elif c is None:
+ return None
else:
attrValue.append(c)
@@ -734,10 +747,6 @@ def __init__(self, data):
self.data = data
def parse(self):
try:
- #Skip to the first ";"
- self.data.jumpTo(";")
- self.data.position += 1
- self.data.skip()
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
@@ -753,8 +762,10 @@ def parse(self):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
- self.data.jumpTo(quoteMark)
- return self.data[oldPosition:self.data.position]
+ if self.data.jumpTo(quoteMark):
+ return self.data[oldPosition:self.data.position]
+ else:
+ return None
else:
#Unquoted value
oldPosition = self.data.position
4 planet/vendor/html5lib/sanitizer.py
View
@@ -152,6 +152,8 @@ def sanitize_token(self, token):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
+ #remove replacement characters from unescaped characters
+ val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
@@ -177,7 +179,7 @@ def sanitize_token(self, token):
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
- if token["type"] == tokenTypes["EmptyTag"]:
+ if token["selfClosing"]:
token["data"]=token["data"][:-1] + "/>"
token["type"] = tokenTypes["Characters"]
del token["name"]
68 planet/vendor/html5lib/serializer/htmlserializer.py
View
@@ -8,8 +8,8 @@
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
-from html5lib.constants import rcdataElements
-
+from html5lib.constants import rcdataElements, entities, xmlEntities
+from html5lib import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
@@ -27,20 +27,33 @@
for k, v in entities.items():
if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
- encode_entity_map[v] = k
+ encode_entity_map[ord(v)] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
- for c in exc.object[exc.start:exc.end]:
- e = encode_entity_map.get(c)
+ codepoints = []
+ skip = False
+ for i, c in enumerate(exc.object[exc.start:exc.end]):
+ if skip:
+ skip = False
+ continue
+ index = i + exc.start
+ if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
+ codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
+ skip = True
+ else:
+ codepoint = ord(c)
+ codepoints.append(codepoint)
+ for cp in codepoints:
+ e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
- res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
+ res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
@@ -54,26 +67,32 @@ def encode(text, encoding):
class HTMLSerializer(object):
+ # attribute quoting options
quote_attr_values = False
quote_char = '"'
use_best_quote_char = True
- minimize_boolean_attributes = True
+ # tag syntax options
+ omit_optional_tags = True
+ minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
+
+ # escaping options
escape_lt_in_attrs = False
escape_rcdata = False
+ resolve_entities = True
+ # miscellaneous options
inject_meta_charset = True
strip_whitespace = False
sanitize = False
- omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
- "escape_rcdata", 'use_trailing_solidus', "sanitize")
+ "escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
@@ -103,7 +122,23 @@ def serialize(self, treewalker, encoding=None):
for token in treewalker:
type = token["type"]
if type == "Doctype":
- doctype = u"<!DOCTYPE %s>" % token["name"]
+ doctype = u"<!DOCTYPE %s" % token["name"]
+
+ if token["publicId"]:
+ doctype += u' PUBLIC "%s"' % token["publicId"]
+ elif token["systemId"]:
+ doctype += u" SYSTEM"
+ if token["systemId"]:
+ if token["systemId"].find(u'"') >= 0:
+ if token["systemId"].find(u"'") >= 0:
+ self.serializeError(_("System identifer contains both single and double quote characters"))
+ quote_char = u"'"
+ else:
+ quote_char = u'"'
+ doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
+
+ doctype += u">"
+
if encoding:
yield doctype.encode(encoding)
else:
@@ -198,6 +233,19 @@ def serialize(self, treewalker, encoding=None):
comment = comment.encode(encoding, unicode_encode_errors)
yield comment
+ elif type == "Entity":
+ name = token["name"]
+ key = name + ";"
+ if not key in entities:
+ self.serializeError(_("Entity %s not recognized" % name))
+ if self.resolve_entities and key not in xmlEntities:
+ data = entities[key]
+ else:
+ data = u"&%s;" % name
+ if encoding:
+ data = data.encode(encoding, unicode_encode_errors)
+ yield data
+
else:
self.serializeError(token["data"])
743 planet/vendor/html5lib/tokenizer.py
View
@@ -9,11 +9,12 @@
except ImportError:
from utils import deque
-from constants import contentModelFlags, spaceCharacters
+from constants import spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF
from constants import tokenTypes, tagTokenTypes
+from constants import replacementCharacters
from inputstream import HTMLInputStream
@@ -47,7 +48,6 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
- self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False
self.lastFourChars = []
self.state = self.dataState
@@ -96,41 +96,43 @@ def consumeNumberEntity(self, isHex):
# Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix)
- if charAsInt == 13:
+ # Certain characters get replaced with others
+ if charAsInt in replacementCharacters:
+ char = replacementCharacters[charAsInt]
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "incorrect-cr-newline-entity"})
- charAsInt = 10
- elif 127 < charAsInt < 160:
- # If the integer is between 127 and 160 (so 128 and bigger and 159
- # and smaller) we need to do the "windows trick".
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "illegal-windows-1252-entity"})
-
- charAsInt = entitiesWindows1252[charAsInt - 128]
-
- # Certain characters get replaced with U+FFFD
- if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
- or (0x007F <= charAsInt <= 0x009F)
- or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
- or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
- or (0x10FFFF < charAsInt)):
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
+ elif ((0xD800 <= charAsInt <= 0xDFFF) or
+ (charAsInt > 0x10FFFF)):
char = u"\uFFFD"
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
else:
+ #Should speed up this check somehow (e.g. move the set to a constant)
+ if ((0x0001 <= charAsInt <= 0x0008) or
+ (0x000E <= charAsInt <= 0x001F) or
+ (0x007F <= charAsInt <= 0x009F) or
+ (0xFDD0 <= charAsInt <= 0xFDEF) or
+ charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
+ 0xFFFFF, 0x10FFFE, 0x10FFFF])):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data":
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
try:
- # XXX We should have a separate function that does "int" to
- # "unicodestring" conversion since this doesn't always work
- # according to hsivonen. Also, unichr has a limitation of 65535
+ # Try/except needed as UCS-2 Python builds' unichr only works
+ # within the BMP.
char = unichr(charAsInt)
- except:
- try:
- char = eval("u'\\U%08x'" % charAsInt)
- except:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "cant-convert-numeric-entity",
- "datavars": {"charAsInt": charAsInt}})
+ except ValueError:
+ char = eval("u'\\U%08x'" % charAsInt)
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
@@ -146,8 +148,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
output = u"&"
charStack = [self.stream.char()]
- if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \
- or (allowedChar is not None and allowedChar == charStack[0]):
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&")
+ or (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == u"#":
@@ -251,43 +253,14 @@ def emitCurrentToken(self):
# Below are the various tokenizer states worked out.
def dataState(self):
- #XXX - consider splitting this state based on the content model flag
data = self.stream.char()
-
- # Keep a charbuffer to handle the escapeFlag
- if (self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
- if len(self.lastFourChars) == 4:
- self.lastFourChars.pop(0)
- self.lastFourChars.append(data)
-
- # The rest of the logic
- if (data == "&" and self.contentModelFlag in
- (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
- not self.escapeFlag):
+ if data == "&":
self.state = self.entityDataState
- elif (data == "-" and self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
- not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
- self.escapeFlag = True
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data":data})
- elif (data == "<" and (self.contentModelFlag ==
- contentModelFlags["PCDATA"]
- or (self.contentModelFlag in
- (contentModelFlags["CDATA"],
- contentModelFlags["RCDATA"]) and
- self.escapeFlag == False))):
+ elif data == "<":
self.state = self.tagOpenState
- elif (data == ">" and self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
- self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
- self.escapeFlag = False
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
elif data is EOF:
# Tokenization ends.
return False
-
elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
@@ -298,13 +271,7 @@ def dataState(self):
# have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences
else:
- if (self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
- chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
- self.lastFourChars += chars[-4:]
- self.lastFourChars = self.lastFourChars[-4:]
- else:
- chars = self.stream.charsUntil((u"&", u"<"))
+ chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
@@ -313,97 +280,108 @@ def entityDataState(self):
self.consumeEntity()
self.state = self.dataState
return True
-
- def tagOpenState(self):
+
+ def rcdataState(self):
data = self.stream.char()
- if self.contentModelFlag == contentModelFlags["PCDATA"]:
- if data == u"!":
- self.state = self.markupDeclarationOpenState
- elif data == u"/":
- self.state = self.closeTagOpenState
- elif data in asciiLetters:
- self.currentToken = {"type": tokenTypes["StartTag"],
- "name": data, "data": [],
- "selfClosing": False,
- "selfClosingAcknowledged": False}
- self.state = self.tagNameState
- elif data == u">":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-right-bracket"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
- self.state = self.dataState
- elif data == u"?":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-question-mark"})
- self.stream.unget(data)
- self.state = self.bogusCommentState
- else:
- # XXX
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
- self.stream.unget(data)
- self.state = self.dataState
- else:
- # We know the content model flag is set to either RCDATA or CDATA
- # now because this state can never be entered with the PLAINTEXT
- # flag.
- if data == u"/":
- self.state = self.closeTagOpenState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
- self.stream.unget(data)
- self.state = self.dataState
+ if data == "&":
+ self.state = self.characterReferenceInRcdata
+ elif data == "<":
+ self.state = self.rcdataLessThanSignState
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ elif data in spaceCharacters:
+ # Directly after emitting a token you switch back to the "data