Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 25 additions & 8 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"""

import collections
from functools import partial

try:
from urlparse import urljoin
except ImportError:
Expand Down Expand Up @@ -77,18 +79,18 @@ def _extract_item(self, node, items_seen, base_url):
item["id"] = itemid.strip()

properties = collections.defaultdict(list)
# start with item references
for name, value in self._extract_properties(
node, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

# process item references
refs = node.get('itemref', '').split()
if refs:
for refid in refs:
for name, value in self._extract_property_refs(
node, refid, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

for name, value in self._extract_properties(
node, items_seen=items_seen, base_url=base_url):
properties[name].append(value)

props = []
for (name, values) in properties.items():
if not self.strict and len(values) == 1:
Expand Down Expand Up @@ -119,10 +121,25 @@ def _extract_properties(self, node, items_seen, base_url):
yield p, v

def _extract_property_refs(self, node, refid, items_seen, base_url):
for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid):
for p, v in self._extract_property(
prop, items_seen=items_seen, base_url=base_url):
ref_node = node.xpath("id($refid)[1]", refid=refid)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand, if there were several elements with the same id, they were all extracted before, but now only the first one is extracted? I realise that there is supposed to be only one element with each id, so this seems fine.

Copy link
Contributor Author

@ivanprado ivanprado Feb 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see... I went to the standard: https://www.w3.org/TR/html401/struct/global.html#h-7.5.2

id = name [CS]
This attribute assigns a name to an element. This name must be unique in a document.

So I think it is safe to expect just one.

if not ref_node:
return
ref_node = ref_node[0]
extract_fn = partial(self._extract_property, items_seen=items_seen,
base_url=base_url)
if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
# A full item will be extracted from the node, so there is no need to look
# for individual properties in its children
for p, v in extract_fn(ref_node):
yield p, v
else:
base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]")
for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"):
parent_scope = prop.xpath("ancestor::*[@itemscope][1]")
# Skip properties defined in a different scope than the ref_node
if parent_scope == base_parent_scope:
for p, v in extract_fn(prop):
yield p, v

def _extract_property(self, node, items_seen, base_url):
props = node.get("itemprop").split()
Expand Down
55 changes: 55 additions & 0 deletions tests/samples/schema.org/product-ref.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Photo gallery</title>
</head>
<body>

<div id="product" itemscope itemtype="http://schema.org/Product" itemref="referenced-product more-properties related_products non-existing-ref">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src=" anvil_executive.jpg" alt="Executive Anvil logo"/>
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
Executive Anvil is perfect for the business traveler
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>

<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99 </span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
in excellent condition
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
</span>
</div>
<div id="referenced-product" itemscope itemtype="http://schema.org/Product" itemprop="referenced_product">
<span itemprop="name">REFERENCED PRODUCT</span>
<img itemprop="image" src="img-ref.jpg">
</div>
<div id="more-properties" itemscope itemtype="http://schema.org/Product">
<span itemprop="prop3">REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT</span>
<img itemprop="image" src="img-2.jpg">
</div>
<div id="related_products">
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 1</span>
<img itemprop="image" src="rel-prod-1.jpg">
</div>
<div itemscope itemtype="http://schema.org/Product" itemprop="related_products">
<span itemprop="name">REL PROD 2</span>
<img itemprop="image" src="rel-prod-2.jpg">
</div>
</div>
</body>
</html>
69 changes: 69 additions & 0 deletions tests/samples/schema.org/product-ref.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
[
{
"type": "http://schema.org/Product",
"properties": {
"referenced_product": {
"type": "http://schema.org/Product",
"properties": {
"name": "REFERENCED PRODUCT",
"image": "img-ref.jpg"
}
},
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
"image": [
"anvil_executive.jpg",
"img-2.jpg"
],
"related_products": [
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 1",
"image": "rel-prod-1.jpg"
}
},
{
"type": "http://schema.org/Product",
"properties": {
"name": "REL PROD 2",
"image": "rel-prod-2.jpg"
}
}
],
"brand": "ACME",
"name": "Executive Anvil",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"type": "http://schema.org/AggregateRating",
"properties": {
"ratingValue": "4.4",
"reviewCount": "89"
}
},
"offers": {
"type": "http://schema.org/Offer",
"properties": {
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {
"type": "http://schema.org/Organization",
"properties": {
"name": "Executive Objects"
}
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
}
},
{
"type": "http://schema.org/Product",
"properties": {
"prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry maybe I'm reading the schema incorrectly, but why do we have a separate product with its properties duplicated also in the product above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The product with id more-properties does not have an itemprop attribute. Because of that, when it is referenced, only its inner properties are included. As the product itself has not been extracted when referencing, it is finally extracted as an individual Product.

Regarding whether that should be the behaviour or not, I'm not completely sure. This is the standard definition of itemref: https://www.w3.org/TR/microdata/#names:-the-itemprop-attribute but it is not very complete.

There is an algorithm definition here: https://www.w3.org/TR/microdata/#associating-names-with-items but I didn't go that deep for this patch, as it does not seem to be the same approach that extruct follows.

I have just tested this HTML in Google Dev Tools and it also extracts the product in isolation, but it does not include prop3 in the first product.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thanks for checking this @ivanprado . I think the current approach makes sense for extruct.

"image": "img-2.jpg"
}
}
]
2 changes: 1 addition & 1 deletion tests/samples/w3c/microdata.5.3.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}},
{"properties": {"a": ["1", "2"], "b": ["test"]}}
{"properties": {"a": ["2", "1"], "b": ["test"]}}
]

13 changes: 13 additions & 0 deletions tests/test_microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,16 @@ def test_join_custom_url(self):
mde = MicrodataExtractor()
data = mde.extract(body, base_url='http://some-example.com')
self.assertEqual(data, expected)


class TestItemref(unittest.TestCase):
    """Microdata extraction test for an item that uses ``itemref``."""

    # Show the full diff when the extracted structure does not match.
    maxDiff = None

    def test_join_none(self):
        """Extract ``product-ref.html`` and compare with ``product-ref.json``."""
        html = get_testdata('schema.org', 'product-ref.html')
        raw_expected = get_testdata('schema.org', 'product-ref.json')
        expected = json.loads(raw_expected.decode('UTF-8'))

        # No base_url is passed to extract() in this test.
        extractor = MicrodataExtractor()
        self.assertEqual(extractor.extract(html), expected)