Skip to content

Commit

Permalink
Avoid including properties from child scopes when referenced via `itemref`
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanprado committed Feb 6, 2019
1 parent 3ab5592 commit c410a1b
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 2 deletions.
15 changes: 13 additions & 2 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,21 @@ def _extract_properties(self, node, items_seen, base_url):
yield p, v

def _extract_property_refs(self, node, refid, items_seen, base_url):
    """Yield the ``(name, value)`` pairs contributed by one ``itemref`` id.

    The element whose id is ``refid`` is looked up from ``node``.  If that
    element carries ``itemprop`` itself, its own properties are yielded.
    Its descendants carrying ``itemprop`` are yielded only when their
    nearest enclosing ``itemscope`` is the same one that encloses (or is)
    the referenced element, so properties belonging to nested child items
    are not attributed to the referencing item.

    :param node: element used as the XPath context for the ``id()`` lookup
    :param refid: a single id taken from the ``itemref`` attribute
    :param items_seen: forwarded unchanged to ``_extract_property``
    :param base_url: forwarded unchanged to ``_extract_property``
    :return: generator of ``(property_name, value)`` tuples; yields nothing
        when no element matches ``refid``
    """
    ref_node = node.xpath("id($refid)[1]", refid=refid)
    if not ref_node:
        # Dangling reference: no element with that id, nothing to yield.
        return
    ref_node = ref_node[0]

    # Nearest itemscope at or above the referenced element.  Descendant
    # properties are kept only if they resolve to this very same scope.
    base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]")

    if 'itemprop' in ref_node.keys():
        for p, v in self._extract_property(
                ref_node, items_seen=items_seen, base_url=base_url):
            yield p, v

    for prop in ref_node.xpath("descendant::*[@itemprop]"):
        parent_scope = prop.xpath("ancestor::*[@itemscope][1]")
        # A different nearest scope means the property sits inside a
        # nested item and belongs to that item, not to the referencer.
        if parent_scope == base_parent_scope:
            for p, v in self._extract_property(
                    prop, items_seen=items_seen, base_url=base_url):
                yield p, v

def _extract_property(self, node, items_seen, base_url):
props = node.get("itemprop").split()
Expand Down
58 changes: 58 additions & 0 deletions tests/samples/schema.org/product-ref.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Photo gallery</title>
</head>
<body>

<div id="product" itemscope itemtype="http://schema.org/Product" itemref="other-product-properties more-properties related_products">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src=" anvil_executive.jpg
" alt="Executive Anvil logo" />
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
Executive Anvil is perfect for the business traveler
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>

<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99 </span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
in excellent condition
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
</span>
</div>
<div id="other-product-properties" itemscope itemtype="http://schema.org/Product" itemprop="referenced_product">
<span itemprop="prop2">PROP 2</span>
<img itemprop="image" src="img-2.jpg">
</div>
<div id="more-properties" itemscope itemtype="http://schema.org/Product">
<span itemprop="prop3">PROP 3</span>
<img itemprop="image" src="img-3.jpg">
</div>
<div id="related_products">
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 1</span>
<img itemprop="image" src="rel-prod-1.jpg">
</div>
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 2</span>
<img itemprop="image" src="rel-prod-2.jpg">
</div>
</div>

</div>
</body>
</html>
71 changes: 71 additions & 0 deletions tests/samples/schema.org/product-ref.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
[
{
"type": "http://schema.org/Product",
"properties": {
"referenced_product": {
"type": "http://schema.org/Product",
"properties": {
"prop2": "PROP 2",
"image": "img-2.jpg"
}
},
"prop2": "PROP 2",
"image": [
"img-2.jpg",
"img-3.jpg",
"anvil_executive.jpg"
],
"prop3": "PROP 3",
"related_products": [
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 1",
"image": "rel-prod-1.jpg"
}
},
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 2",
"image": "rel-prod-2.jpg"
}
}
],
"brand": "ACME",
"name": "Executive Anvil",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"type": "http://schema.org/AggregateRating",
"properties": {
"ratingValue": "4.4",
"reviewCount": "89"
}
},
"offers": {
"type": "http://schema.org/Offer",
"properties": {
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {
"type": "http://schema.org/Organization",
"properties": {
"name": "Executive Objects"
}
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
}
},
{
"type": "http://schema.org/Product",
"properties": {
"prop3": "PROP 3",
"image": "img-3.jpg"
}
}
]
13 changes: 13 additions & 0 deletions tests/test_microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,16 @@ def test_join_custom_url(self):
mde = MicrodataExtractor()
data = mde.extract(body, base_url='http://some-example.com')
self.assertEqual(data, expected)


class TestItemref(unittest.TestCase):
    """Microdata extraction for items that pull in properties via ``itemref``."""

    # Disable unittest's diff truncation so a mismatch in the (large)
    # extracted structure is reported in full.
    maxDiff = None

    def test_join_none(self):
        """Extract ``product-ref.html`` without a base URL and compare it
        against the expected structure stored in ``product-ref.json``."""
        body = get_testdata('schema.org', 'product-ref.html')
        expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))

        mde = MicrodataExtractor()
        data = mde.extract(body)
        self.assertEqual(data, expected)

0 comments on commit c410a1b

Please sign in to comment.