diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index a254a5c6..a9e1c507 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -11,6 +11,7 @@ import collections from functools import partial +from copy import deepcopy try: from urlparse import urljoin @@ -18,11 +19,32 @@ from urllib.parse import urljoin import lxml.etree +from lxml.html.clean import Cleaner from w3lib.html import strip_html5_whitespace +import html_text from extruct.utils import parse_html +# Cleaner which is similar to html_text cleaner, but is less aggressive +cleaner = Cleaner( + scripts=True, + javascript=False, # onclick attributes are fine + comments=True, + style=True, + links=True, + meta=True, + page_structure=False, # may be nice to have + processing_instructions=True, + embedded=False, # keep embedded content + frames=False, # keep frames + forms=False, # keep forms + annoying_tags=False, + remove_unknown_tags=False, + safe_attrs_only=False, +) + + class LxmlMicrodataExtractor(object): _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]') _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop], @@ -49,11 +71,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"): return self.extract_items(tree, base_url) def extract_items(self, document, base_url): + cleaned_document = cleaner.clean_html(document) items_seen = set() return [ item for item in ( self._extract_item(it, items_seen=items_seen, base_url=base_url) - for it in self._xp_item(document)) + for it in self._xp_item(cleaned_document)) if item] def _extract_item(self, node, items_seen, base_url): @@ -182,7 +205,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False): return self._extract_textContent(node) def _extract_textContent(self, node): - return u"".join(self._xp_clean_text(node)).strip() + return html_text.etree_to_text(node) MicrodataExtractor = LxmlMicrodataExtractor diff --git a/requirements.txt b/requirements.txt index 87a27224..04bc5c16 
100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ rdflib-jsonld mf2py>=1.1.0 six w3lib +html-text diff --git a/setup.py b/setup.py index 5bbc7553..f7e60387 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def get_version(): 'rdflib-jsonld', 'mf2py', 'w3lib', + 'html-text>=0.5.1', 'six'], extras_require={ 'service': [