Skip to content

Commit

Permalink
Merge pull request #105 from ivanprado/master
Browse files Browse the repository at this point in the history
Avoid including itemprop from child itemscopes when using itemref
  • Loading branch information
lopuhin committed Feb 14, 2019
2 parents 3ab5592 + e655b0a commit b99b14d
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 9 deletions.
33 changes: 25 additions & 8 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"""

import collections
from functools import partial

try:
from urlparse import urljoin
except ImportError:
Expand Down Expand Up @@ -77,18 +79,18 @@ def _extract_item(self, node, items_seen, base_url):
item["id"] = itemid.strip()

properties = collections.defaultdict(list)
# start with item references
for name, value in self._extract_properties(
node, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

# process item references
refs = node.get('itemref', '').split()
if refs:
for refid in refs:
for name, value in self._extract_property_refs(
node, refid, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

for name, value in self._extract_properties(
node, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

props = []
for (name, values) in properties.items():
if not self.strict and len(values) == 1:
Expand Down Expand Up @@ -119,10 +121,25 @@ def _extract_properties(self, node, items_seen, base_url):
yield p, v

def _extract_property_refs(self, node, refid, items_seen, base_url):
for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid):
for p, v in self._extract_property(
prop, items_seen=items_seen, base_url=base_url):
# Look up the element whose id is `refid`; yield nothing when no such
# element exists.
ref_node = node.xpath("id($refid)[1]", refid=refid)
if not ref_node:
return
ref_node = ref_node[0]
extract_fn = partial(self._extract_property, items_seen=items_seen,
base_url=base_url)
if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
# A full item will be extracted from the node, so there is no need to
# look for individual properties in its children.
for p, v in extract_fn(ref_node):
yield p, v
else:
# Nearest itemscope element that is the referenced node itself or one of
# its ancestors (an empty result when there is none).
base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]")
for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"):
parent_scope = prop.xpath("ancestor::*[@itemscope][1]")
# Skip properties whose nearest enclosing itemscope differs from the
# referenced node's: they belong to a nested item, not to this one.
if parent_scope == base_parent_scope:
for p, v in extract_fn(prop):
yield p, v

def _extract_property(self, node, items_seen, base_url):
props = node.get("itemprop").split()
Expand Down
55 changes: 55 additions & 0 deletions tests/samples/schema.org/product-ref.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Photo gallery</title>
</head>
<body>

<div id="product" itemscope itemtype="http://schema.org/Product" itemref="referenced-product more-properties related_products non-existing-ref">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src=" anvil_executive.jpg" alt="Executive Anvil logo"/>
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
Executive Anvil is perfect for the business traveler
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>

<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99 </span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
in excellent condition
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
</span>
</div>
<div id="referenced-product" itemscope itemtype="http://schema.org/Product" itemprop="referenced_product">
<span itemprop="name">REFERENCED PRODUCT</span>
<img itemprop="image" src="img-ref.jpg">
</div>
<div id="more-properties" itemscope itemtype="http://schema.org/Product">
<span itemprop="prop3">REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT</span>
<img itemprop="image" src="img-2.jpg">
</div>
<div id="related_products">
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 1</span>
<img itemprop="image" src="rel-prod-1.jpg">
</div>
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 2</span>
<img itemprop="image" src="rel-prod-2.jpg">
</div>
</div>
</body>
</html>
69 changes: 69 additions & 0 deletions tests/samples/schema.org/product-ref.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
[
{
"type": "http://schema.org/Product",
"properties": {
"referenced_product": {
"type": "http://schema.org/Product",
"properties": {
"name": "REFERENCED PRODUCT",
"image": "img-ref.jpg"
}
},
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
"image": [
"anvil_executive.jpg",
"img-2.jpg"
],
"related_products": [
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 1",
"image": "rel-prod-1.jpg"
}
},
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 2",
"image": "rel-prod-2.jpg"
}
}
],
"brand": "ACME",
"name": "Executive Anvil",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"type": "http://schema.org/AggregateRating",
"properties": {
"ratingValue": "4.4",
"reviewCount": "89"
}
},
"offers": {
"type": "http://schema.org/Offer",
"properties": {
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {
"type": "http://schema.org/Organization",
"properties": {
"name": "Executive Objects"
}
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
}
},
{
"type": "http://schema.org/Product",
"properties": {
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
"image": "img-2.jpg"
}
}
]
2 changes: 1 addition & 1 deletion tests/samples/w3c/microdata.5.3.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}}
{"properties": {"a": ["2", "1"], "b": ["test"]}}
]

13 changes: 13 additions & 0 deletions tests/test_microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,16 @@ def test_join_custom_url(self):
mde = MicrodataExtractor()
data = mde.extract(body, base_url='http://some-example.com')
self.assertEqual(data, expected)


class TestItemref(unittest.TestCase):
    """Microdata extraction for a page whose item uses ``itemref``."""

    # Show the full diff when the nested structures do not match.
    maxDiff = None

    def test_join_none(self):
        # Extract from the sample page without passing a base URL and
        # compare against the stored JSON fixture.
        html = get_testdata('schema.org', 'product-ref.html')
        raw_expected = get_testdata('schema.org', 'product-ref.json')
        expected = json.loads(raw_expected.decode('UTF-8'))

        extractor = MicrodataExtractor()
        extracted = extractor.extract(html)
        self.assertEqual(extracted, expected)

0 comments on commit b99b14d

Please sign in to comment.