Merge pull request #81 from scrapinghub/add-option-to-return-html-nod…

…e-during-extraction [MRG+1] Add option to return html node during extraction
scrapinghub · Jun 6, 2018 · e9c7456 · e9c7456
2 parents 995f6b0 + fa06c83
commit e9c7456
Show file tree

Hide file tree

Showing 11 changed files with 228 additions and 23 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -31,6 +31,9 @@ v0.5.0 (TBD)
 * In ``w3microdata`` ``_extract_properties``, ``_extract_property_refs``, 
   ``_extract_property``, ``_extract_property_value`` and ``_extract_item``
   now need ``items_seen`` and ``url`` to be passed as arguments.
+* Add argument ``return_html_node`` to ``extract``; it allows returning the
+  HTML node along with the result of metadata extraction. It is supported
+  only by the microdata syntax.
 
 Warning: backward-incompatible change:
 

diff --git a/README.rst b/README.rst
@@ -258,6 +258,42 @@ To do so set ``uniform=True`` when calling ``extract``, it's false by default fo
 
 NB rdfa structure is not uniformed yet
 
+Returning HTML node
++++++++++++++++++++
+
+It is also possible to get a reference to the HTML node for every extracted metadata item.
+The feature is supported only by the microdata syntax.
+
+To use it, set the ``return_html_node`` option of the ``extract`` method to ``True``.
+As a result, an additional key "htmlNode" will be included in the result for every
+item. Each node is of ``lxml.etree.Element`` type: ::
+
+  >>> r = requests.get('http://www.rugpadcorner.com/shop/no-muv/')
+  >>> base_url = get_base_url(r.text, r.url)
+  >>> data = extruct.extract(r.text, base_url, syntaxes=['microdata'], return_html_node=True)
+  >>>
+  >>> pp.pprint(data)
+  { 'microdata': [ { 'htmlNode': <Element div at 0x7f10f8e6d3b8>,
+                     'properties': { 'description': 'KEEP RUGS FLAT ON CARPET!\n'
+                                                    'Not your thin sticky pad, '
+                                                    'No-Muv is truly the best!',
+                                     'image': ['', ''],
+                                     'name': ['No-Muv', 'No-Muv'],
+                                     'offers': [ { 'htmlNode': <Element div at 0x7f10f8e6d138>,
+                                                   'properties': { 'availability': 'http://schema.org/InStock',
+                                                                   'price': 'Price:  '
+                                                                            '$45'},
+                                                   'type': 'http://schema.org/Offer'},
+                                                 { 'htmlNode': <Element div at 0x7f10f8e60f48>,
+                                                   'properties': { 'availability': 'http://schema.org/InStock',
+                                                                   'price': '(Select '
+                                                                            'Size/Shape '
+                                                                            'for '
+                                                                            'Pricing)'},
+                                                   'type': 'http://schema.org/Offer'}],
+                                     'ratingValue': ['5.00', '5.00']},
+                     'type': 'http://schema.org/Product'}]}
+
 Single extractors
 -----------------
 

diff --git a/extruct/_extruct.py b/extruct/_extruct.py
@@ -20,6 +20,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
+            return_html_node=False,
             schema_context='http://schema.org',
             **kwargs):
     """htmlstring: string with valid html document;
@@ -34,6 +35,10 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
                  '@type': 'example_type',
                  /* All other the properties in keys here */
                  }
+       return_html_node: if True, the HTML node of each embedded metadata
+                         item is included in the result under the 'htmlNode' key.
+                         The feature is supported only by the microdata syntax.
+                         Each node is of `lxml.etree.Element` type.
        schema_context: schema's context for current page"""
     if base_url is None and 'url' in kwargs:
         warnings.warn('"url" argument is deprecated, please use "base_url"',
@@ -51,7 +56,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
     tree = fromstring(htmlstring, parser=domparser)
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor().extract_items, tree))
+        processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
     if 'json-ld' in syntaxes:
         processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
     if 'opengraph' in syntaxes:

diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py
@@ -32,10 +32,11 @@ class LxmlMicrodataExtractor(object):
                                        + count(ancestor::*[@itemscope])
                                        + 1""")
 
-    def __init__(self, nested=True, strict=False, add_text_content=False):
+    def __init__(self, nested=True, strict=False, add_text_content=False, add_html_node=False):
         self.nested = nested
         self.strict = strict
         self.add_text_content = add_text_content
+        self.add_html_node = add_html_node
 
     def get_docid(self, node):
         return int(self._xp_item_docid(node))
@@ -101,11 +102,13 @@ def _extract_item(self, node, items_seen, base_url):
             item["value"] = self._extract_property_value(
                 node, force=True, items_seen=items_seen, base_url=base_url)
 
-        # not in the specs, but can be handy
+        # below are not in the specs, but can be handy
         if self.add_text_content:
             textContent = self._extract_textContent(node)
             if textContent:
                 item["textContent"] = textContent
+        if self.add_html_node:
+            item["htmlNode"] = node
 
         return item
 

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -5,6 +5,7 @@
 
 tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'samples')
 
+
 def get_testdata(*paths):
     """Return test data"""
     path = os.path.join(tests_datadir, *paths)
@@ -14,3 +15,17 @@ def get_testdata(*paths):
 
 def jsonize_dict(d):
     return json.loads(json.dumps(d))
+
+
+def replace_node_ref_with_node_id(item):
+    if isinstance(item, list):
+        for i in item:
+            replace_node_ref_with_node_id(i)
+    if isinstance(item, dict):
+        for key in list(item):
+            val = item[key]
+            if key == "htmlNode":
+                item["_nodeId_"] = val.get("id")
+                del item[key]
+            else:
+                replace_node_ref_with_node_id(val)
diff --git a/tests/samples/schema.org/CreativeWork.001.html b/tests/samples/schema.org/CreativeWork.001.html
@@ -59,19 +59,19 @@
 }
 </script>
 <div>
-<div itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/2292573321">
+<div id="book1" itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/2292573321">
         <h1><span itemprop="name">Rouge et le noir</span></h1>
-    <div>Author: <span property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
+    <div>Author: <span id="author1" property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
         <div>Language: <span itemprop="inLanguage" content="fr">French</span></div>
-        <div>Has Translation: <span itemprop="workTranslation" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/460647">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></div>
+        <div>Has Translation: <span id="creativeWork1" itemprop="workTranslation" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/460647">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></div>
 </div>
-<div itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/460647">
+<div id="book2" itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/460647">
     <h1><span itemprop="name">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></h1>
-    <div>Author: <span itemprop="author" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
+    <div>Author: <span id="author2" itemprop="author" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
         <div>Language: <span itemprop="inLanguage" content="en">English</span></div>
         <div>Subject: <span itemprop="about">Psychological fiction, French</span></div>
-        <div>Translation of: <span itemprop="translationOfWork" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/2292573321">Rouge et le noir</span></div>
-        <div>Translator: <span property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/8453420">Robert Martin Adams</span></div>
+        <div>Translation of: <span id="creativeWork2" itemprop="translationOfWork" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/2292573321">Rouge et le noir</span></div>
+        <div>Translator: <span id="translator2" property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/8453420">Robert Martin Adams</span></div>
 </div>
 </div>
 </body>

diff --git a/tests/samples/schema.org/CreativeWork_flat_with_node_id.001.json b/tests/samples/schema.org/CreativeWork_flat_with_node_id.001.json
@@ -0,0 +1,107 @@
+{
+  "microdata": [
+    {
+      "_nodeId_": "book1",
+      "@type": "Book",
+      "@context": "http://schema.org",
+      "id": "http://worldcat.org/entity/work/id/2292573321",
+      "name": "Rouge et le noir",
+      "inLanguage": "fr",
+      "workTranslation": {
+        "_nodeId_": "creativeWork1",
+        "@type": "CreativeWork",
+        "id": "http://worldcat.org/entity/work/id/460647",
+        "value": "Red and Black : A New Translation, Backgrounds and Sources, Criticism"
+        }
+      },
+    {
+      "_nodeId_": "author1",
+      "@type": "Person",
+      "@context": "http://schema.org",
+      "id": "http://viaf.org/viaf/17823",
+      "value": "Stendhal"
+    },
+    {
+      "_nodeId_": "book2",
+      "@type": "Book",
+      "@context": "http://schema.org",
+      "id": "http://worldcat.org/entity/work/id/460647",
+      "name": "Red and Black : A New Translation, Backgrounds and Sources, Criticism",
+      "author": {
+        "_nodeId_": "author2",
+        "@type": "Person",
+        "id": "http://viaf.org/viaf/17823",
+        "value": "Stendhal"
+      },
+      "inLanguage": "en",
+      "about": "Psychological fiction, French",
+      "translationOfWork": {
+        "_nodeId_": "creativeWork2",
+        "@type": "CreativeWork",
+        "id": "http://worldcat.org/entity/work/id/2292573321",
+        "value": "Rouge et le noir"
+        }
+      },
+    {
+      "_nodeId_": "translator2",
+      "@type": "Person",
+      "@context": "http://schema.org",
+      "id": "http://viaf.org/viaf/8453420",
+      "value": "Robert Martin Adams"
+    }
+  ],
+  "json-ld": [
+    {
+      "@context": "http://schema.org",
+      "@type": "WebPage",
+      "breadcrumb": "Books > Literature & Fiction > Classics",
+      "mainEntity": {
+        "@type": "Book",
+        "author": "/author/jd_salinger.html",
+        "bookFormat": "http://schema.org/Paperback",
+        "datePublished": "1991-05-01",
+        "image": "catcher-in-the-rye-book-cover.jpg",
+        "inLanguage": "English",
+        "isbn": "0316769487",
+        "name": "The Catcher in the Rye",
+        "numberOfPages": "224",
+        "offers": {
+          "@type": "Offer",
+          "availability": "http://schema.org/InStock",
+          "price": "6.99",
+          "priceCurrency": "USD"
+        },
+        "publisher": "Little, Brown, and Company",
+        "aggregateRating": {
+          "@type": "AggregateRating",
+          "ratingValue": "4",
+          "reviewCount": "3077"
+        },
+        "review": [
+          {
+            "@type": "Review",
+            "author": "John Doe",
+            "datePublished": "2006-05-04",
+            "name": "A masterpiece of literature",
+            "reviewBody": "I really enjoyed this book. It captures the essential challenge people face as they try make sense of their lives and grow to adulthood.",
+            "reviewRating": {
+              "@type": "Rating",
+              "ratingValue": "5"
+            }
+          },
+          {
+            "@type": "Review",
+            "author": "Bob Smith",
+            "datePublished": "2006-06-15",
+            "name": "A good read.",
+            "reviewBody": "Catcher in the Rye is a fun book. It's a good book to read.",
+            "reviewRating": "4"
+          }
+        ]
+      }
+    }
+  ],
+  "opengraph": [],
+  "microformat": [],
+  "rdfa": []
+}
diff --git a/tests/samples/schema.org/product.html b/tests/samples/schema.org/product.html
@@ -5,7 +5,7 @@
  </head>
  <body>
 
-  <div itemscope itemtype="http://schema.org/Product">
+  <div id="product" itemscope itemtype="http://schema.org/Product">
     <span itemprop="brand">ACME</span>
     <span itemprop="name">Executive Anvil</span>
     <img itemprop="image" src=" anvil_executive.jpg
@@ -15,18 +15,18 @@
       looking for something to drop from a height.
     </span>
     Product #: <span itemprop="mpn">925872</span>
-    <span itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
+    <span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
       <span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
         </span> reviews
     </span>
 
-    <span itemprop="offers" itemscope itemtype="http://schema.org/Offer">
+    <span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
       Regular price: $179.99
       <meta itemprop="priceCurrency" content="USD" />
       $<span itemprop="price">119.99 </span>
       (Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
         5 November!</time>)
-      Available from: <span itemprop="seller" itemscope itemtype="http://schema.org/Organization">
+      Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
                         <span itemprop="name">Executive Objects</span>
                       </span>
       Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,

diff --git a/tests/samples/schema.org/product_custom_url_and_node_id.json b/tests/samples/schema.org/product_custom_url_and_node_id.json
@@ -0,0 +1,23 @@
+[{"type": "http://schema.org/Product",
+  "_nodeId_": "product",
+  "properties": {"brand": "ACME",
+                 "name": "Executive Anvil",
+                 "image": "http://some-example.com/anvil_executive.jpg",
+                 "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
+                 "mpn": "925872",
+                 "aggregateRating": {"type": "http://schema.org/AggregateRating",
+                                     "_nodeId_": "aggregateRating",
+                                     "properties": {"ratingValue": "4.4",
+                                     "reviewCount": "89"}},
+                 "offers": {"type": "http://schema.org/Offer",
+                            "_nodeId_": "offer",
+                            "properties": {"priceCurrency": "USD",
+                                           "price": "119.99",
+                                           "priceValidUntil": "2020-11-05",
+                                           "seller": {"type": "http://schema.org/Organization",
+                                                      "_nodeId_": "organization",
+                                                      "properties":{"name": "Executive Objects"}},
+                            "itemCondition": "http://schema.org/UsedCondition",
+                            "availability": "http://schema.org/InStock"}}
+                  }
+  }]
diff --git a/tests/test_extruct.py b/tests/test_extruct.py
@@ -3,7 +3,7 @@
 import unittest
 
 import extruct
-from tests import get_testdata, jsonize_dict
+from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
 
 
 class TestGeneric(unittest.TestCase):
@@ -17,25 +17,32 @@ def test_all(self):
         self.assertEqual(jsonize_dict(data), expected)
 
     def test_microdata_custom_url(self):
-        body, expected = self._microdata_custom_url()
+        body, expected = self._microdata_custom_url('product_custom_url.json')
         data = extruct.extract(body, base_url='http://some-example.com',
                                syntaxes=['microdata'])
         self.assertEqual(data, expected)
 
+    def test_microdata_with_returning_node(self):
+        body, expected = self._microdata_custom_url('product_custom_url_and_node_id.json')
+        data = extruct.extract(body, base_url='http://some-example.com',
+                               syntaxes=['microdata'], return_html_node=True)
+        replace_node_ref_with_node_id(data)
+        self.assertEqual(data, expected)
+
     def test_deprecated_url(self):
-        body, expected = self._microdata_custom_url()
+        body, expected = self._microdata_custom_url('product_custom_url.json')
         data = extruct.extract(body, url='http://some-example.com',
                                syntaxes=['microdata'])
         self.assertEqual(data, expected)
 
     def test_extra_kwargs(self):
-        body, expected = self._microdata_custom_url()
+        body, expected = self._microdata_custom_url('product_custom_url.json')
         with self.assertRaises(TypeError):
             extruct.extract(body, foo='bar')
 
-    def _microdata_custom_url(self):
+    def _microdata_custom_url(self, test_file):
         body = get_testdata('schema.org', 'product.html')
         expected = {'microdata': json.loads(
-            get_testdata('schema.org', 'product_custom_url.json')
+            get_testdata('schema.org', test_file)
             .decode('UTF-8'))}
         return body, expected