diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 54e9c167..a254a5c6 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -10,6 +10,8 @@ """ import collections +from functools import partial + try: from urlparse import urljoin except ImportError: @@ -77,7 +79,11 @@ def _extract_item(self, node, items_seen, base_url): item["id"] = itemid.strip() properties = collections.defaultdict(list) - # start with item references + for name, value in self._extract_properties( + node, items_seen=items_seen, base_url=base_url): + properties[name].append(value) + + # process item references refs = node.get('itemref', '').split() if refs: for refid in refs: @@ -85,10 +91,6 @@ def _extract_item(self, node, items_seen, base_url): node, refid, items_seen=items_seen, base_url=base_url): properties[name].append(value) - for name, value in self._extract_properties( - node, items_seen=items_seen, base_url=base_url): - properties[name].append(value) - props = [] for (name, values) in properties.items(): if not self.strict and len(values) == 1: @@ -119,10 +121,25 @@ def _extract_properties(self, node, items_seen, base_url): yield p, v def _extract_property_refs(self, node, refid, items_seen, base_url): - for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid): - for p, v in self._extract_property( - prop, items_seen=items_seen, base_url=base_url): + ref_node = node.xpath("id($refid)[1]", refid=refid) + if not ref_node: + return + ref_node = ref_node[0] + extract_fn = partial(self._extract_property, items_seen=items_seen, + base_url=base_url) + if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys(): + # A full item will be extracted from the node, no need to look + # for individual properties in children + for p, v in extract_fn(ref_node): yield p, v + else: + base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]") + for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"): + parent_scope = 
prop.xpath("ancestor::*[@itemscope][1]") + # Skip properties defined in a different scope than the ref_node + if parent_scope == base_parent_scope: + for p, v in extract_fn(prop): + yield p, v def _extract_property(self, node, items_seen, base_url): props = node.get("itemprop").split() diff --git a/tests/samples/schema.org/product-ref.html b/tests/samples/schema.org/product-ref.html new file mode 100644 index 00000000..4c7390dc --- /dev/null +++ b/tests/samples/schema.org/product-ref.html @@ -0,0 +1,55 @@ + + + + Photo gallery + + + +
+ ACME + Executive Anvil + Executive Anvil logo + Sleeker than ACME's Classic Anvil, the + Executive Anvil is perfect for the business traveler + looking for something to drop from a height. + + Product #: 925872 + + 4.4 stars, based on 89 + reviews + + + + Regular price: $179.99 + + $119.99 + (Sale ends ) + Available from: + Executive Objects + + Condition: Previously owned, + in excellent condition + In stock! Order now! + +
+
+ REFERENCED PRODUCT + +
+
+ REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT + +
+ + + \ No newline at end of file diff --git a/tests/samples/schema.org/product-ref.json b/tests/samples/schema.org/product-ref.json new file mode 100644 index 00000000..7c2bf8d6 --- /dev/null +++ b/tests/samples/schema.org/product-ref.json @@ -0,0 +1,69 @@ +[ + { + "type": "http://schema.org/Product", + "properties": { + "referenced_product": { + "type": "http://schema.org/Product", + "properties": { + "name": "REFERENCED PRODUCT", + "image": "img-ref.jpg" + } + }, + "prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT", + "image": [ + "anvil_executive.jpg", + "img-2.jpg" + ], + "related_products": [ + { + "type": "http://schema.org/Product", + "properties": { + "name": "REL PROD 1", + "image": "rel-prod-1.jpg" + } + }, + { + "type": "http://schema.org/Product", + "properties": { + "name": "REL PROD 2", + "image": "rel-prod-2.jpg" + } + } + ], + "brand": "ACME", + "name": "Executive Anvil", + "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "mpn": "925872", + "aggregateRating": { + "type": "http://schema.org/AggregateRating", + "properties": { + "ratingValue": "4.4", + "reviewCount": "89" + } + }, + "offers": { + "type": "http://schema.org/Offer", + "properties": { + "priceCurrency": "USD", + "price": "119.99", + "priceValidUntil": "2020-11-05", + "seller": { + "type": "http://schema.org/Organization", + "properties": { + "name": "Executive Objects" + } + }, + "itemCondition": "http://schema.org/UsedCondition", + "availability": "http://schema.org/InStock" + } + } + } + }, + { + "type": "http://schema.org/Product", + "properties": { + "prop3": "REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT", + "image": "img-2.jpg" + } + } +] \ No newline at end of file diff --git a/tests/samples/w3c/microdata.5.3.json b/tests/samples/w3c/microdata.5.3.json index 3bae0123..8f064ede 100644 --- a/tests/samples/w3c/microdata.5.3.json +++ 
b/tests/samples/w3c/microdata.5.3.json @@ -2,6 +2,6 @@ {"properties": {"a": ["1", "2"], "b": ["test"]}}, {"properties": {"a": ["1", "2"], "b": ["test"]}}, {"properties": {"a": ["1", "2"], "b": ["test"]}}, - {"properties": {"a": ["1", "2"], "b": ["test"]}} + {"properties": {"a": ["2", "1"], "b": ["test"]}} ] diff --git a/tests/test_microdata.py b/tests/test_microdata.py index c7597a3a..19bbbdd3 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -171,3 +171,16 @@ def test_join_custom_url(self): mde = MicrodataExtractor() data = mde.extract(body, base_url='http://some-example.com') self.assertEqual(data, expected) + + +class TestItemref(unittest.TestCase): + + maxDiff = None + + def test_join_none(self): + body = get_testdata('schema.org', 'product-ref.html') + expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8')) + + mde = MicrodataExtractor() + data = mde.extract(body) + self.assertEqual(data, expected)