Merge pull request #148 from scrapinghub/microdata-fast-doc-id
Fast get_docid for microdata parser (fixes GH-147)
ivanprado committed Aug 31, 2020
2 parents 205ee73 + 6f2e2d2 commit bf8219b
Showing 1 changed file with 32 additions and 26 deletions.
58 changes: 32 additions & 26 deletions extruct/w3cmicrodata.py
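In the old code, get_docid evaluated the counting XPath count(preceding::*[@itemscope]) + count(ancestor::*[@itemscope]) + 1 for every itemscope node, re-scanning the document per item and turning quadratic on pages with many items. The new _build_itemids enumerates all itemscope nodes once, in document order, so get_docid becomes a dictionary lookup. A minimal standalone sketch of the idea (illustrative names, not the exact extruct code):

    import lxml.html
    from lxml import etree

    _xp_item = etree.XPath('descendant-or-self::*[@itemscope]')

    def build_itemids(root):
        # Enumerate every itemscope element once, in document order,
        # and remember its 1-based position.
        return {node: idx + 1 for idx, node in enumerate(_xp_item(root))}

    def get_docid(node, itemids):
        # O(1) dict lookup instead of an O(n) counting XPath per call.
        return itemids[node]

    root = lxml.html.fromstring(
        '<div itemscope><span itemscope></span><p itemscope></p></div>')
    itemids = build_itemids(root)
    print([get_docid(n, itemids) for n in _xp_item(root)])  # -> [1, 2, 3]

Because the dictionary is filled from the same XPath in the same document order, the assigned ids (and the iid_ref values emitted when nested=False) should come out identical to the old implementation.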
@@ -45,40 +45,44 @@
 
 
 class LxmlMicrodataExtractor(object):
+    # iterate in document order (used below for fast get_docid)
     _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
     _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
                                                   .//*[@itemscope]//*[@itemprop])""",
                                 namespaces = {"set": "http://exslt.org/sets"})
     _xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()')
-    # ancestor and preceding axes contain all elements before the context node
-    # so counting them gives the "document order" of the context node
-    _xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
-                                         + count(ancestor::*[@itemscope])
-                                         + 1""")
 
     def __init__(self, nested=True, strict=False, add_text_content=False, add_html_node=False):
         self.nested = nested
         self.strict = strict
         self.add_text_content = add_text_content
         self.add_html_node = add_html_node
 
-    def get_docid(self, node):
-        return int(self._xp_item_docid(node))
-
     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         tree = parse_html(htmlstring, encoding=encoding)
         return self.extract_items(tree, base_url)
 
     def extract_items(self, document, base_url):
+        itemids = self._build_itemids(document)
         items_seen = set()
         return [
             item for item in (
-                self._extract_item(it, items_seen=items_seen, base_url=base_url)
+                self._extract_item(
+                    it, items_seen=items_seen, base_url=base_url, itemids=itemids)
                 for it in self._xp_item(document))
             if item]
 
-    def _extract_item(self, node, items_seen, base_url):
-        itemid = self.get_docid(node)
+    def get_docid(self, node, itemids):
+        return itemids[node]
+
+    def _build_itemids(self, document):
+        """ Build itemids for a fast get_docid implementation. Use document order.
+        """
+        root = document.getroottree().getroot()
+        return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
+
+    def _extract_item(self, node, items_seen, base_url, itemids):
+        itemid = self.get_docid(node, itemids)
 
         if self.nested:
             if itemid in items_seen:
@@ -95,21 +99,22 @@ def _extract_item(self, node, items_seen, base_url):
         else:
             item["type"] = types
 
-        itemid = node.get('itemid')
-        if itemid:
-            item["id"] = itemid.strip()
+        nodeid = node.get('itemid')
+        if nodeid:
+            item["id"] = nodeid.strip()
 
         properties = collections.defaultdict(list)
         for name, value in self._extract_properties(
-                node, items_seen=items_seen, base_url=base_url):
+                node, items_seen=items_seen, base_url=base_url, itemids=itemids):
             properties[name].append(value)
 
         # process item references
         refs = node.get('itemref', '').split()
         if refs:
             for refid in refs:
                 for name, value in self._extract_property_refs(
-                        node, refid, items_seen=items_seen, base_url=base_url):
+                        node, refid, items_seen=items_seen, base_url=base_url,
+                        itemids=itemids):
                     properties[name].append(value)
 
         props = []
@@ -123,7 +128,8 @@ def _extract_item(self, node, items_seen, base_url):
         else:
             # item without properties; let's use the node itself
             item["value"] = self._extract_property_value(
-                node, force=True, items_seen=items_seen, base_url=base_url)
+                node, force=True, items_seen=items_seen, base_url=base_url,
+                itemids=itemids)
 
         # below are not in the specs, but can be handy
         if self.add_text_content:
@@ -135,19 +141,19 @@ def _extract_item(self, node, items_seen, base_url):
 
         return item
 
-    def _extract_properties(self, node, items_seen, base_url):
+    def _extract_properties(self, node, items_seen, base_url, itemids):
        for prop in self._xp_prop(node):
             for p, v in self._extract_property(
-                    prop, items_seen=items_seen, base_url=base_url):
+                    prop, items_seen=items_seen, base_url=base_url, itemids=itemids):
                 yield p, v
 
-    def _extract_property_refs(self, node, refid, items_seen, base_url):
+    def _extract_property_refs(self, node, refid, items_seen, base_url, itemids):
         ref_node = node.xpath("id($refid)[1]", refid=refid)
         if not ref_node:
             return
         ref_node = ref_node[0]
         extract_fn = partial(self._extract_property, items_seen=items_seen,
-                             base_url=base_url)
+                             base_url=base_url, itemids=itemids)
         if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
             # An full item will be extracted from the node, no need to look
             # for individual properties in child nodes
@@ -162,20 +168,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
                 for p, v in extract_fn(prop):
                     yield p, v
 
-    def _extract_property(self, node, items_seen, base_url):
+    def _extract_property(self, node, items_seen, base_url, itemids):
         props = node.get("itemprop").split()
         value = self._extract_property_value(
-            node, items_seen=items_seen, base_url=base_url)
+            node, items_seen=items_seen, base_url=base_url, itemids=itemids)
         return [(p, value) for p in props]
 
-    def _extract_property_value(self, node, items_seen, base_url, force=False):
+    def _extract_property_value(self, node, items_seen, base_url, itemids, force=False):
         #http://www.w3.org/TR/microdata/#values
         if not force and node.get("itemscope") is not None:
             if self.nested:
                 return self._extract_item(
-                    node, items_seen=items_seen, base_url=base_url)
+                    node, items_seen=items_seen, base_url=base_url, itemids=itemids)
             else:
-                return {"iid_ref": self.get_docid(node)}
+                return {"iid_ref": self.get_docid(node, itemids)}
 
         elif node.tag == "meta":
             return node.get("content", "")
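For reference, a quick way to exercise the extractor after this change (assumes extruct is installed; the HTML snippet and printed fields are only an illustration — the public output is expected to be unchanged, since only the internal get_docid bookkeeping was replaced):

    from extruct.w3cmicrodata import MicrodataExtractor

    html = """
    <div itemscope itemtype="http://schema.org/Product">
      <span itemprop="name">The Mighty Skink</span>
      <div itemprop="brand" itemscope itemtype="http://schema.org/Brand">
        <span itemprop="name">ACME</span>
      </div>
    </div>
    """

    mde = MicrodataExtractor()  # nested=True by default
    for item in mde.extract(html):
        print(item["type"], item["properties"])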
