Permalink
Browse files

Update to the latest html5lib; replace feedparser's sanitizer with

html5lib's
  • Loading branch information...
1 parent 63fa05e commit 6f0f23dd36266ddbfa3662b9ba95b79e4c588584 Sam Ruby committed Sep 9, 2009
Showing with 4,864 additions and 2,382 deletions.
  1. +2 −2 planet/reconstitute.py
  2. +8 −2 planet/scrub.py
  3. +3 −2 planet/vendor/html5lib/__init__.py
  4. +622 −306 planet/vendor/html5lib/constants.py
  5. +127 −0 planet/vendor/html5lib/filters/formfiller.py
  6. +40 −13 planet/vendor/html5lib/filters/optionaltags.py
  7. +8 −0 planet/vendor/html5lib/filters/sanitizer.py
  8. +1,484 −845 planet/vendor/html5lib/html5parser.py
  9. +170 −0 planet/vendor/html5lib/ihatexml.py
  10. +409 −233 planet/vendor/html5lib/inputstream.py
  11. +0 −147 planet/vendor/html5lib/liberalxmlparser.py
  12. +42 −16 planet/vendor/html5lib/sanitizer.py
  13. +14 −0 planet/vendor/html5lib/serializer/__init__.py
  14. +1 −1 planet/vendor/html5lib/serializer/htmlserializer.py
  15. +616 −454 planet/vendor/html5lib/tokenizer.py
  16. +21 −7 planet/vendor/html5lib/treebuilders/__init__.py
  17. +51 −34 planet/vendor/html5lib/treebuilders/_base.py
  18. +272 −183 planet/vendor/html5lib/treebuilders/dom.py
  19. +82 −18 planet/vendor/html5lib/treebuilders/etree.py
  20. +331 −0 planet/vendor/html5lib/treebuilders/etree_lxml.py
  21. +35 −8 planet/vendor/html5lib/treebuilders/simpletree.py
  22. +85 −22 planet/vendor/html5lib/treebuilders/soup.py
  23. +23 −15 planet/vendor/html5lib/treewalkers/_base.py
  24. +2 −1 planet/vendor/html5lib/treewalkers/dom.py
  25. +71 −53 planet/vendor/html5lib/treewalkers/etree.py
  26. +13 −10 planet/vendor/html5lib/treewalkers/genshistream.py
  27. +175 −0 planet/vendor/html5lib/treewalkers/lxmletree.py
  28. +8 −4 planet/vendor/html5lib/treewalkers/pulldom.py
  29. +2 −2 planet/vendor/html5lib/treewalkers/simpletree.py
  30. +26 −3 planet/vendor/html5lib/treewalkers/soup.py
  31. +120 −0 planet/vendor/html5lib/utils.py
  32. +1 −1 tests/data/reconstitute/content_illegal_char.xml
View
@@ -16,7 +16,7 @@
import re, time, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
-from html5lib import liberalxmlparser
+from html5lib import html5parser
from html5lib.treebuilders import dom
import planet, config
@@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
bozo=1
if detail.type.find('xhtml')<0 or bozo:
- parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
+ parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue
View
@@ -128,5 +128,11 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
- node['value'] = feedparser._sanitizeHTML(
- node.value, 'utf-8', node.type)
+ # Run this through HTML5's serializer
+ from html5lib import html5parser, sanitizer, treewalkers, serializer
+ p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
+ doc = p.parseFragment(node.value, encoding='utf-8')
+ walker = treewalkers.getTreeWalker('simpletree')
+ xhtml = serializer.XHTMLSerializer()
+ tree = xhtml.serialize(walker(doc), encoding='utf-8')
+ node['value'] = ''.join([n for n in tree])
@@ -11,5 +11,6 @@
p = html5lib.HTMLParser()
tree = p.parse(f)
"""
-from html5parser import HTMLParser
-from liberalxmlparser import XMLParser, XHTMLParser
+from html5parser import HTMLParser, parse
+from treebuilders import getTreeBuilder
+from serializer import serialize
Oops, something went wrong.

0 comments on commit 6f0f23d

Please sign in to comment.