Permalink
Browse files

resync with html5lib and feedparser

  • Loading branch information...
1 parent 1bcee5c commit 77970dbaaa7edf25644726b4500633a1d0fd9de2 @rubys committed May 12, 2010
View
@@ -25,7 +25,7 @@
except:
from md5 import new as md5
-illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
+illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
@@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
+ value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
@@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c):
""" replace invalid characters """
- return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
+ return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value):
@@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
+ if illegal_xml_chars.search(data.data):
+ data = xdoc.createTextNode(
+ illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')
View
@@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
- # Run this through HTML5's serializer
- from html5lib import html5parser, sanitizer, treebuilders
+ # Run this through HTML5's sanitizer
+ doc = None
+ if 'xhtml' in node['type']:
+ try:
+ from xml.dom import minidom
+ doc = minidom.parseString(node['value'])
+ except:
+ node['type']='text/html'
+
+ if not doc:
+ from html5lib import html5parser, treebuilders
+ p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+ doc = p.parseFragment(node['value'], encoding='utf-8')
+
from html5lib import treewalkers, serializer
- p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
- tree=treebuilders.getTreeBuilder('dom'))
- doc = p.parseFragment(node.value, encoding='utf-8')
+ from html5lib.filters import sanitizer
+ walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
- walker = treewalkers.getTreeWalker('dom')
- tree = xhtml.serialize(walker(doc), encoding='utf-8')
+ tree = xhtml.serialize(walker, encoding='utf-8')
+
node['value'] = ''.join([str(token) for token in tree])
@@ -1595,9 +1595,12 @@ def __init__(self, baseuri, baselang, encoding):
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
+ self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
+ if uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
@@ -1622,7 +1625,7 @@ def startElementNS(self, name, qname, attrs):
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
+ attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -8,9 +8,10 @@
import html5lib
f = open("my_document.html")
-p = html5lib.HTMLParser()
-tree = p.parse(f)
+tree = html5lib.parse(f)
"""
-from html5parser import HTMLParser, parse
+__version__ = "%(version)s"
+from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
+from treewalkers import getTreeWalker
from serializer import serialize
@@ -180,6 +180,8 @@
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
+ "unexpected-form-in-table":
+ _(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
@@ -256,21 +258,18 @@
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
+ "eof-in-script-in-script":
+ _(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
+ "unexpected-end-tag-before-html":
+ _(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
-contentModelFlags = {
- "PCDATA":0,
- "RCDATA":1,
- "CDATA":2,
- "PLAINTEXT":3
-}
-
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
@@ -509,6 +508,8 @@
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
+xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
@@ -878,6 +879,44 @@
"zwnj;": u"\u200C"
}
+replacementCharacters = {
+ 0x0:u"\uFFFD",
+ 0x0d:u"\u000A",
+ 0x80:u"\u20AC",
+ 0x81:u"\u0081",
+ 0x81:u"\u0081",
+ 0x82:u"\u201A",
+ 0x83:u"\u0192",
+ 0x84:u"\u201E",
+ 0x85:u"\u2026",
+ 0x86:u"\u2020",
+ 0x87:u"\u2021",
+ 0x88:u"\u02C6",
+ 0x89:u"\u2030",
+ 0x8A:u"\u0160",
+ 0x8B:u"\u2039",
+ 0x8C:u"\u0152",
+ 0x8D:u"\u008D",
+ 0x8E:u"\u017D",
+ 0x8F:u"\u008F",
+ 0x90:u"\u0090",
+ 0x91:u"\u2018",
+ 0x92:u"\u2019",
+ 0x93:u"\u201C",
+ 0x94:u"\u201D",
+ 0x95:u"\u2022",
+ 0x96:u"\u2013",
+ 0x97:u"\u2014",
+ 0x98:u"\u02DC",
+ 0x99:u"\u2122",
+ 0x9A:u"\u0161",
+ 0x9B:u"\u203A",
+ 0x9C:u"\u0153",
+ 0x9D:u"\u009D",
+ 0x9E:u"\u017E",
+ 0x9F:u"\u0178",
+}
+
encodings = {
'437': 'cp437',
'850': 'cp850',
Oops, something went wrong.

0 comments on commit 77970db

Please sign in to comment.