diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py
index a254a5c6..a9e1c507 100644
--- a/extruct/w3cmicrodata.py
+++ b/extruct/w3cmicrodata.py
@@ -11,6 +11,7 @@
import collections
from functools import partial
+from copy import deepcopy
try:
from urlparse import urljoin
@@ -18,11 +19,32 @@
from urllib.parse import urljoin
import lxml.etree
+from lxml.html.clean import Cleaner
from w3lib.html import strip_html5_whitespace
+import html_text
from extruct.utils import parse_html
+# Cleaner which is similar to html_text cleaner, but is less aggressive
+cleaner = Cleaner(
+ scripts=True,
+ javascript=False, # onclick attributes are fine
+ comments=True,
+ style=True,
+ links=True,
+ meta=True,
+ page_structure=False, # <title> may be nice to have
+ processing_instructions=True,
+ embedded=False, # keep embedded content
+ frames=False, # keep frames
+ forms=False, # keep forms
+ annoying_tags=False,
+ remove_unknown_tags=False,
+ safe_attrs_only=False,
+)
+
+
class LxmlMicrodataExtractor(object):
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
@@ -49,11 +71,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
return self.extract_items(tree, base_url)
def extract_items(self, document, base_url):
+ cleaned_document = cleaner.clean_html(document)
items_seen = set()
return [
item for item in (
self._extract_item(it, items_seen=items_seen, base_url=base_url)
- for it in self._xp_item(document))
+ for it in self._xp_item(cleaned_document))
if item]
def _extract_item(self, node, items_seen, base_url):
@@ -182,7 +205,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
return self._extract_textContent(node)
def _extract_textContent(self, node):
- return u"".join(self._xp_clean_text(node)).strip()
+ return html_text.etree_to_text(node)
MicrodataExtractor = LxmlMicrodataExtractor
diff --git a/requirements.txt b/requirements.txt
index 87a27224..04bc5c16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ rdflib-jsonld
mf2py>=1.1.0
six
w3lib
+html-text
diff --git a/setup.py b/setup.py
index 5bbc7553..f7e60387 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def get_version():
'rdflib-jsonld',
'mf2py',
'w3lib',
+ 'html-text>=0.5.1',
'six'],
extras_require={
'service': [