Skip to content

Commit

Permalink
resync with html5lib and feedparser
Browse files Browse the repository at this point in the history
  • Loading branch information
rubys committed May 12, 2010
1 parent 1bcee5c commit 77970db
Show file tree
Hide file tree
Showing 27 changed files with 1,254 additions and 638 deletions.
8 changes: 6 additions & 2 deletions planet/reconstitute.py
Expand Up @@ -25,7 +25,7 @@
except:
from md5 import new as md5

illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)

def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
Expand All @@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
Expand All @@ -43,7 +44,7 @@ def createTextElement(parent, name, value):

def invalidate(c):
""" replace invalid characters """
return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]

def ncr2c(value):
Expand Down Expand Up @@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
if illegal_xml_chars.search(data.data):
data = xdoc.createTextNode(
illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')
Expand Down
25 changes: 18 additions & 7 deletions planet/scrub.py
Expand Up @@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)

# Run this through HTML5's serializer
from html5lib import html5parser, sanitizer, treebuilders
# Run this through HTML5's sanitizer
doc = None
if 'xhtml' in node['type']:
try:
from xml.dom import minidom
doc = minidom.parseString(node['value'])
except:
node['type']='text/html'

if not doc:
from html5lib import html5parser, treebuilders
p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node['value'], encoding='utf-8')

from html5lib import treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node.value, encoding='utf-8')
from html5lib.filters import sanitizer
walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
walker = treewalkers.getTreeWalker('dom')
tree = xhtml.serialize(walker(doc), encoding='utf-8')
tree = xhtml.serialize(walker, encoding='utf-8')

node['value'] = ''.join([str(token) for token in tree])
5 changes: 4 additions & 1 deletion planet/vendor/feedparser.py
Expand Up @@ -1595,9 +1595,12 @@ def __init__(self, baseuri, baselang, encoding):
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
self.decls = {}

def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
if uri == 'http://www.w3.org/1999/xlink':
self.decls['xmlns:'+prefix] = uri

def startElementNS(self, name, qname, attrs):
namespace, localname = name
Expand All @@ -1622,7 +1625,7 @@ def startElementNS(self, name, qname, attrs):
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
attrsD = {}
attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
Expand Down
7 changes: 4 additions & 3 deletions planet/vendor/html5lib/__init__.py
Expand Up @@ -8,9 +8,10 @@
import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
tree = html5lib.parse(f)
"""
from html5parser import HTMLParser, parse
__version__ = "%(version)s"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize
53 changes: 46 additions & 7 deletions planet/vendor/html5lib/constants.py
Expand Up @@ -180,6 +180,8 @@
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-form-in-table":
_(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
Expand Down Expand Up @@ -256,21 +258,18 @@
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"eof-in-script-in-script":
_(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"unexpected-end-tag-before-html":
_(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}

# Integer code assigned to each content-model flag name (0 through 3).
# NOTE(review): presumably consumed by the tokenizer to select how text
# inside an element is parsed -- confirm against the callers.
contentModelFlags = dict(PCDATA=0, RCDATA=1, CDATA=2, PLAINTEXT=3)

namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
Expand Down Expand Up @@ -509,6 +508,8 @@
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)

# Names of the five entities predefined by XML, each written with its
# trailing semicolon.
xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])

entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
Expand Down Expand Up @@ -878,6 +879,44 @@
"zwnj;": u"\u200C"
}

# Maps a code point to the character that replaces it.
#
#   * 0x00 becomes U+FFFD (REPLACEMENT CHARACTER).
#   * 0x0D (carriage return) becomes U+000A (line feed).
#   * 0x80-0x9F become the characters Windows-1252 assigns to those byte
#     values; 0x81, 0x8D, 0x8F, 0x90 and 0x9D map to the C1 control
#     character with the same code point.
#
# NOTE(review): presumably used when resolving numeric character
# references -- confirm against the tokenizer.
#
# Each key is listed exactly once: in a dict display a repeated key
# silently overwrites the earlier entry, so duplicates only hide mistakes.
replacementCharacters = {
    0x0: u"\uFFFD",
    0x0d: u"\u000A",
    0x80: u"\u20AC",
    0x81: u"\u0081",
    0x82: u"\u201A",
    0x83: u"\u0192",
    0x84: u"\u201E",
    0x85: u"\u2026",
    0x86: u"\u2020",
    0x87: u"\u2021",
    0x88: u"\u02C6",
    0x89: u"\u2030",
    0x8A: u"\u0160",
    0x8B: u"\u2039",
    0x8C: u"\u0152",
    0x8D: u"\u008D",
    0x8E: u"\u017D",
    0x8F: u"\u008F",
    0x90: u"\u0090",
    0x91: u"\u2018",
    0x92: u"\u2019",
    0x93: u"\u201C",
    0x94: u"\u201D",
    0x95: u"\u2022",
    0x96: u"\u2013",
    0x97: u"\u2014",
    0x98: u"\u02DC",
    0x99: u"\u2122",
    0x9A: u"\u0161",
    0x9B: u"\u203A",
    0x9C: u"\u0153",
    0x9D: u"\u009D",
    0x9E: u"\u017E",
    0x9F: u"\u0178",
}

encodings = {
'437': 'cp437',
'850': 'cp850',
Expand Down

0 comments on commit 77970db

Please sign in to comment.