Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,40 @@

import collections
from functools import partial
from copy import deepcopy

try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin

import lxml.etree
from lxml.html.clean import Cleaner
from w3lib.html import strip_html5_whitespace
import html_text

from extruct.utils import parse_html


# Cleaner which is similar to html_text cleaner, but is less aggressive.
#
# extract_items() runs this cleaner over the whole document *before* the
# itemscope/itemprop nodes are looked up, so it must not remove elements
# that can carry microdata. <meta itemprop=...> and <link itemprop=...> are
# such elements, hence ``links`` and ``meta`` are disabled here. Both are
# void elements, so keeping them adds no text to extracted values.
cleaner = Cleaner(
    scripts=True,
    javascript=False,  # onclick attributes are fine
    comments=True,
    style=True,
    links=False,  # keep <link>: it may carry itemprop/href microdata
    meta=False,  # keep <meta>: it may carry itemprop/content microdata
    page_structure=False,  # <title> may be nice to have
    processing_instructions=True,
    embedded=False,  # keep embedded content
    frames=False,  # keep frames
    forms=False,  # keep forms
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)


class LxmlMicrodataExtractor(object):
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
Expand All @@ -49,11 +71,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
return self.extract_items(tree, base_url)

def extract_items(self, document, base_url):
    """Extract microdata items from a parsed HTML document.

    The document is first passed through the module-level ``cleaner``;
    every node matched by ``_xp_item`` (nodes with an ``itemscope``
    attribute) in the cleaned result is then handed to ``_extract_item``.

    :param document: parsed lxml document to extract items from.
    :param base_url: base URL forwarded unchanged to ``_extract_item``.
    :return: list of the truthy results of ``_extract_item``, in the
        order the nodes are returned by ``_xp_item``.
    """
    cleaned_document = cleaner.clean_html(document)
    # One set is shared by all _extract_item calls made for this document.
    items_seen = set()
    # Iterate the cleaned document only; the stale pre-change clause that
    # iterated the raw ``document`` has been dropped.
    extracted = (
        self._extract_item(node, items_seen=items_seen, base_url=base_url)
        for node in self._xp_item(cleaned_document)
    )
    # Falsy results from _extract_item are filtered out.
    return [item for item in extracted if item]

def _extract_item(self, node, items_seen, base_url):
Expand Down Expand Up @@ -182,7 +205,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
return self._extract_textContent(node)

def _extract_textContent(self, node):
    """Return the text of ``node`` as produced by ``html_text``.

    :param node: lxml element whose text content is wanted.
    :return: the value returned by ``html_text.etree_to_text(node)``.
    """
    # The previous XPath-join implementation returned first and made this
    # call unreachable; only the html_text based extraction is kept.
    return html_text.etree_to_text(node)


# Alias: exposes LxmlMicrodataExtractor under the name MicrodataExtractor.
MicrodataExtractor = LxmlMicrodataExtractor
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ rdflib-jsonld
mf2py>=1.1.0
six
w3lib
html-text
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def get_version():
'rdflib-jsonld',
'mf2py',
'w3lib',
'html-text>=0.5.1',
'six'],
extras_require={
'service': [
Expand Down