From 258664f18b03f63fa69a90e6510cc90f021540a4 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 6 Feb 2019 14:25:26 +0000 Subject: [PATCH 1/4] Avoid including itemprop from child itemscopes when using itemref --- extruct/w3cmicrodata.py | 15 ++++- tests/samples/schema.org/product-ref.html | 55 ++++++++++++++++++ tests/samples/schema.org/product-ref.json | 71 +++++++++++++++++++++++ tests/test_microdata.py | 13 +++++ 4 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 tests/samples/schema.org/product-ref.html create mode 100644 tests/samples/schema.org/product-ref.json diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 54e9c167..414f9333 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -119,10 +119,21 @@ def _extract_properties(self, node, items_seen, base_url): yield p, v def _extract_property_refs(self, node, refid, items_seen, base_url): - for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid): + ref_node = node.xpath("id($refid)[1]", refid=refid) + if not ref_node: + return + ref_node = ref_node[0] + base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]") + if 'itemprop' in ref_node.keys(): for p, v in self._extract_property( - prop, items_seen=items_seen, base_url=base_url): + ref_node, items_seen=items_seen, base_url=base_url): yield p, v + for prop in ref_node.xpath("descendant::*[@itemprop]"): + parent_scope = prop.xpath("ancestor::*[@itemscope][1]") + if parent_scope == base_parent_scope: + for p, v in self._extract_property( + prop, items_seen=items_seen, base_url=base_url): + yield p, v def _extract_property(self, node, items_seen, base_url): props = node.get("itemprop").split() diff --git a/tests/samples/schema.org/product-ref.html b/tests/samples/schema.org/product-ref.html new file mode 100644 index 00000000..bd0d32c2 --- /dev/null +++ b/tests/samples/schema.org/product-ref.html @@ -0,0 +1,55 @@ + + + + Photo gallery + + + +
+ ACME + Executive Anvil + Executive Anvil logo + Sleeker than ACME's Classic Anvil, the + Executive Anvil is perfect for the business traveler + looking for something to drop from a height. + + Product #: 925872 + + 4.4 stars, based on 89 + reviews + + + + Regular price: $179.99 + + $119.99 + (Sale ends ) + Available from: + Executive Objects + + Condition: Previously owned, + in excellent condition + In stock! Order now! + +
+
+ PROP 2 + +
+
+ PROP 3 + +
+ + + \ No newline at end of file diff --git a/tests/samples/schema.org/product-ref.json b/tests/samples/schema.org/product-ref.json new file mode 100644 index 00000000..70818fd3 --- /dev/null +++ b/tests/samples/schema.org/product-ref.json @@ -0,0 +1,71 @@ +[ + { + "type": "http://schema.org/Product", + "properties": { + "referenced_product": { + "type": "http://schema.org/Product", + "properties": { + "prop2": "PROP 2", + "image": "img-2.jpg" + } + }, + "prop2": "PROP 2", + "image": [ + "img-2.jpg", + "img-3.jpg", + "anvil_executive.jpg" + ], + "prop3": "PROP 3", + "related_products": [ + { + "type": "http://schema.org/Product", + "properties": { + "name": "REL PROD 1", + "image": "rel-prod-1.jpg" + } + }, + { + "type": "http://schema.org/Product", + "properties": { + "name": "REL PROD 2", + "image": "rel-prod-2.jpg" + } + } + ], + "brand": "ACME", + "name": "Executive Anvil", + "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "mpn": "925872", + "aggregateRating": { + "type": "http://schema.org/AggregateRating", + "properties": { + "ratingValue": "4.4", + "reviewCount": "89" + } + }, + "offers": { + "type": "http://schema.org/Offer", + "properties": { + "priceCurrency": "USD", + "price": "119.99", + "priceValidUntil": "2020-11-05", + "seller": { + "type": "http://schema.org/Organization", + "properties": { + "name": "Executive Objects" + } + }, + "itemCondition": "http://schema.org/UsedCondition", + "availability": "http://schema.org/InStock" + } + } + } + }, + { + "type": "http://schema.org/Product", + "properties": { + "prop3": "PROP 3", + "image": "img-3.jpg" + } + } +] \ No newline at end of file diff --git a/tests/test_microdata.py b/tests/test_microdata.py index c7597a3a..19bbbdd3 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -171,3 +171,16 @@ def test_join_custom_url(self): mde = MicrodataExtractor() data = mde.extract(body, base_url='http://some-example.com') self.assertEqual(data, expected) + + +class TestItemref(unittest.TestCase): + + maxDiff = None + + def test_join_none(self): + body = get_testdata('schema.org', 'product-ref.html') + expected = json.loads(get_testdata('schema.org', 'product-ref.json').decode('UTF-8')) + + mde = MicrodataExtractor() + data = mde.extract(body) + self.assertEqual(data, expected) From 4171aa23fa4d125224f7e9737bf8fd7ff966d143 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 6 Feb 2019 16:21:30 +0000 Subject: [PATCH 2/4] Fix referencing with itemref --- extruct/w3cmicrodata.py | 26 +++-- tests/samples/schema.org/product-ref.html | 12 +-- tests/samples/schema.org/product-ref.json | 124 +++++++++++----------- tests/test_microdata.py | 1 + 4 files changed, 83 insertions(+), 80 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 414f9333..604ecce2 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -10,6 +10,8 @@ """ import collections +from functools import partial + try: from urlparse import urljoin except ImportError: @@ -123,17 +125,19 @@ def _extract_property_refs(self, node, refid, items_seen, base_url): if not ref_node: return ref_node = ref_node[0] - base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]") - if 'itemprop' in ref_node.keys(): - for p, v in self._extract_property( - ref_node, items_seen=items_seen, base_url=base_url): - yield p, v - for prop in ref_node.xpath("descendant::*[@itemprop]"): - parent_scope = prop.xpath("ancestor::*[@itemscope][1]") - if parent_scope == base_parent_scope: - for p, v in self._extract_property( - prop, items_seen=items_seen, base_url=base_url): - yield p, v + extract_fn = partial(self._extract_property, items_seen=items_seen, + base_url=base_url) + if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys(): + # An full item will be extracted from the node, no need to look + # for individual properties in childs + yield from extract_fn(ref_node) + else: + base_parent_scope = ref_node.xpath("ancestor-or-self::*[@itemscope][1]") + for prop in ref_node.xpath("descendant-or-self::*[@itemprop]"): + parent_scope = prop.xpath("ancestor::*[@itemscope][1]") + # Skip properties defined in a different scope than the ref_node + if parent_scope == base_parent_scope: + yield from extract_fn(prop) def _extract_property(self, node, items_seen, base_url): props = node.get("itemprop").split() diff --git a/tests/samples/schema.org/product-ref.html b/tests/samples/schema.org/product-ref.html index bd0d32c2..4c7390dc 100644 --- a/tests/samples/schema.org/product-ref.html +++ b/tests/samples/schema.org/product-ref.html @@ -5,7 +5,7 @@ -
+
ACME Executive Anvil Executive Anvil logo @@ -33,13 +33,13 @@ In stock! Order now!
-
- PROP 2 - +
+ REFERENCED PRODUCT +
- PROP 3 - + REFERENCED TO INCLUDE PROPERTIES AND ALSO INDIVIDUAL PRODUCT +