Merge pull request #79 from scrapinghub/base-url
Use base_url instead of url
kmike committed May 15, 2018
2 parents f76b918 + 485dd26 commit e0d2d22
Showing 11 changed files with 119 additions and 73 deletions.
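
In practice, the API change this commit makes looks as follows (a minimal sketch distilled from the README diff below; the fetched URL is illustrative):

    import extruct
    import requests
    from w3lib.html import get_base_url

    r = requests.get('https://example.com/page.html')  # illustrative URL

    # New calling convention: compute the document's base URL (which honours
    # any <base href="..."> tag) and pass it explicitly.
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Old calling convention, still accepted by this commit but deprecated:
    # data = extruct.extract(r.text, url=r.url)  # emits DeprecationWarning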
19 changes: 12 additions & 7 deletions README.rst
@@ -56,8 +56,9 @@ Usage
 All-in-one extraction
 +++++++++++++++++++++
 
-The simplest example how to use extruct is to call ``extruct.extract(htmlstring, url)``
-with some HTML string and a URL.
+The simplest example how to use extruct is to call
+``extruct.extract(htmlstring, base_url=base_url)``
+with some HTML string and an optional base URL.
 
 Let's try this on a webpage that uses all the syntaxes supported (RDFa with `ogp`_).
 
@@ -66,10 +67,12 @@ First fetch the HTML using python-requests and then feed the response body to ``
 >>> import extruct
 >>> import requests
 >>> import pprint
+>>> from w3lib.html import get_base_url
 >>>
 >>> pp = pprint.PrettyPrinter(indent=2)
 >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/')
->>> data = extruct.extract(r.text, r.url)
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url=base_url)
 >>>
 >>> pp.pprint(data)
 { 'json-ld': [ { '@context': 'https://schema.org',
@@ -167,7 +170,8 @@ Select syntaxes
 It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned::
 
 >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
->>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'])
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url, syntaxes=['microdata', 'opengraph', 'rdfa'])
 >>>
 >>> pp.pprint(data)
 { 'microdata': [],
@@ -217,7 +221,8 @@ Another option is to uniform the output of microformat, opengraph, microdata and
 To do so set ``uniform=True`` when calling ``extract``, it's false by default for backward compatibility. Here the same example as before but with uniform set to True: ::
 
 >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
->>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
 >>>
 >>> pp.pprint(data)
 { 'microdata': [],
@@ -387,7 +392,7 @@ RDFa extraction (experimental)
 ... """
 >>>
 >>> rdfae = RDFaExtractor()
->>> pp.pprint(rdfae.extract(html, url='http://www.example.com/index.html'))
+>>> pp.pprint(rdfae.extract(html, base_url='http://www.example.com/index.html'))
 [{'@id': 'http://www.example.com/alice/posts/trouble_with_bob',
   '@type': ['http://schema.org/BlogPosting'],
   'http://purl.org/dc/terms/creator': [{'@id': 'http://www.example.com/index.html#me'}],
@@ -441,7 +446,7 @@ Open Graph extraction
 ... </html>"""
 >>>
 >>> opengraphe = OpenGraphExtractor()
->>> pp.pprint(opengraphe.extract(html, url='http://www.example.com/index.html'))
+>>> pp.pprint(opengraphe.extract(html))
 [{"namespace": {
      "og": "http://ogp.me/ns#"
   },
32 changes: 19 additions & 13 deletions extruct/_extruct.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 
 from lxml.html import fromstring
 
@@ -15,13 +16,14 @@
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
 
 
-def extract(htmlstring, url=None, encoding="UTF-8",
+def extract(htmlstring, base_url=None, encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
-            schema_context='http://schema.org'):
+            schema_context='http://schema.org',
+            **kwargs):
     """htmlstring: string with valid html document;
-       url: url of the html documents
+       base_url: base url of the html document
        encoding: encoding of the html document
        syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
@@ -33,6 +35,12 @@ def extract(htmlstring, url=None, encoding="UTF-8",
           /* All other the properties in keys here */
        }
       schema_context: schema's context for current page"""
+    if base_url is None and 'url' in kwargs:
+        warnings.warn('"url" argument is deprecated, please use "base_url"',
+                      DeprecationWarning)
+        base_url = kwargs.pop('url')
+    if kwargs:
+        raise TypeError('Unexpected keyword arguments')
     if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
@@ -43,24 +51,22 @@ def extract(htmlstring, url=None, encoding="UTF-8",
     tree = fromstring(htmlstring, parser=domparser)
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor().extract_items))
+        processors.append(('microdata', MicrodataExtractor().extract_items, tree))
     if 'json-ld' in syntaxes:
-        processors.append(('json-ld', JsonLdExtractor().extract_items))
+        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
     if 'opengraph' in syntaxes:
-        processors.append(('opengraph', OpenGraphExtractor().extract_items))
+        processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
     if 'microformat' in syntaxes:
-        processors.append(('microformat', MicroformatExtractor().extract_items))
+        processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
     if 'rdfa' in syntaxes:
-        processors.append(('rdfa', RDFaExtractor().extract_items))
+        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
     output = {}
-    for label, extract in processors:
+    for label, extract, document in processors:
         try:
-            output[label] = [obj for obj in extract(document=tree,
-                                                    url=url,
-                                                    html=htmlstring)]
+            output[label] = list(extract(document, base_url=base_url))
         except Exception:
             if errors == 'log':
-                logger.exception("Failed to extract {} from {}".format(label, url))
+                logger.exception('Failed to extract {}'.format(label))
             if errors == 'ignore':
                 pass
             if errors == 'strict':
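
The deprecation shim added above can be exercised directly; a short sketch of the behaviour it implements (the empty HTML document is illustrative):

    import warnings
    import extruct

    html = '<html><body></body></html>'

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        extruct.extract(html, url='http://example.com/')  # deprecated keyword
        assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Any other unexpected keyword is rejected outright:
    # extruct.extract(html, foo='bar')  -> TypeError('Unexpected keyword arguments')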
7 changes: 4 additions & 3 deletions extruct/jsonld.py
@@ -12,15 +12,16 @@
 
 HTML_OR_JS_COMMENTLINE = re.compile('^\s*(//.*|<!--.*-->)')
 
 
 class JsonLdExtractor(object):
     _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8"):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         parser = lxml.html.HTMLParser(encoding=encoding)
         lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
-        return self.extract_items(lxmldoc)
+        return self.extract_items(lxmldoc, base_url=base_url)
 
-    def extract_items(self, document, *args, **kwargs):
+    def extract_items(self, document, base_url=None):
         return [item for items in map(self._extract_items,
                                       self._xp_jsonld(document))
                 for item in items
9 changes: 5 additions & 4 deletions extruct/microformat.py
@@ -1,10 +1,11 @@
 import mf2py
 
 
 class MicroformatExtractor(object):
 
-    def extract(self, htmlstring, url=None, encoding='UTF-8'):
-        return list(self.extract_items(htmlstring, url=url))
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        return list(self.extract_items(htmlstring, base_url=base_url))
 
-    def extract_items(self, html, url, document=None):
-        for obj in mf2py.parse(html, html_parser="lxml", url=url)['items']:
+    def extract_items(self, html, base_url=None):
+        for obj in mf2py.parse(html, html_parser="lxml", url=base_url)['items']:
             yield obj
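
With this change every extractor exposes the same extract_items(document, base_url=...) shape, which is what lets the dispatch loop in _extruct.py above call them uniformly; microformats are the one case that parses the raw HTML string rather than an lxml tree, hence htmlstring in its processors entry. A hedged sketch of calling this extractor directly (the h-entry snippet is illustrative):

    from extruct.microformat import MicroformatExtractor

    html = '<article class="h-entry"><h1 class="p-name">Hello</h1></article>'
    mfe = MicroformatExtractor()
    # mf2py resolves relative URLs in the document against base_url.
    items = mfe.extract(html, base_url='http://example.com/')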
6 changes: 3 additions & 3 deletions extruct/opengraph.py
@@ -5,12 +5,12 @@
 class OpenGraphExtractor(object):
     """OpenGraph extractor following extruct API."""
 
-    def extract(self, htmlstring, url=None, encoding='UTF-8'):
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
         parser = lxml.html.HTMLParser(encoding=encoding)
         doc = lxml.html.fromstring(htmlstring, parser=parser)
-        return list(self.extract_items(doc))
+        return list(self.extract_items(doc, base_url=base_url))
 
-    def extract_items(self, document, *args, **kwargs):
+    def extract_items(self, document, base_url=None):
         # OpenGraph defines a web page as a single rich object.
         # TODO: Handle known opengraph namespaces.
         for head in document.xpath('//head'):
10 changes: 5 additions & 5 deletions extruct/rdfa.py
@@ -29,14 +29,14 @@
 
 class RDFaExtractor(object):
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8",
-                expanded=True):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8",
+                expanded=True):
 
         domparser = XmlDomHTMLParser(encoding=encoding)
         tree = fromstring(htmlstring, parser=domparser)
-        return self.extract_items(tree, url, expanded=expanded)
+        return self.extract_items(tree, base_url=base_url, expanded=expanded)
 
-    def extract_items(self, document, url, expanded=True, *args, **kwargs):
+    def extract_items(self, document, base_url=None, expanded=True):
         options = Options(output_processor_graph=True,
                           embedded_rdf=False,
                           space_preserve=True,
@@ -46,6 +46,6 @@ def extract_items(self, document, url, expanded=True, *args, **kwargs):
                           refresh_vocab_cache=False,
                           check_lite=False)
 
-        g = PyRdfa(options, base=url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
+        g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
         return json.loads(jsonld_string)
66 changes: 37 additions & 29 deletions extruct/w3cmicrodata.py
@@ -19,6 +19,7 @@
 import lxml.html
 from w3lib.html import strip_html5_whitespace
 
+
 class LxmlMicrodataExtractor(object):
     _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
     _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
@@ -39,26 +40,26 @@ def __init__(self, nested=True, strict=False, add_text_content=False):
     def get_docid(self, node):
         return int(self._xp_item_docid(node))
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8"):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         parser = lxml.html.HTMLParser(encoding=encoding)
         lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
-        return self.extract_items(lxmldoc, url)
+        return self.extract_items(lxmldoc, base_url)
 
-    def extract_items(self, document, url, *args, **kwargs):
-        self.url = url
-        self.items_seen = set()
-        return [item
-                for item in map(self.extract_item,
-                                self._xp_item(document))
-                if item]
+    def extract_items(self, document, base_url):
+        items_seen = set()
+        return [
+            item for item in (
+                self._extract_item(it, items_seen=items_seen, base_url=base_url)
+                for it in self._xp_item(document))
+            if item]
 
-    def extract_item(self, node):
+    def _extract_item(self, node, items_seen, base_url):
         itemid = self.get_docid(node)
 
         if self.nested:
-            if itemid in self.items_seen:
+            if itemid in items_seen:
                 return
-            self.items_seen.add(itemid)
+            items_seen.add(itemid)
 
         item = {}
         if not self.nested:
@@ -79,10 +80,12 @@ def extract_item(self, node):
         refs = node.get('itemref', '').split()
         if refs:
             for refid in refs:
-                for name, value in self.extract_property_refs(node, refid):
+                for name, value in self._extract_property_refs(
+                        node, refid, items_seen=items_seen, base_url=base_url):
                     properties[name].append(value)
 
-        for name, value in self.extract_properties(node):
+        for name, value in self._extract_properties(
+                node, items_seen=items_seen, base_url=base_url):
             properties[name].append(value)
 
         props = []
@@ -95,50 +98,55 @@
             item["properties"] = dict(props)
         else:
             # item without properties; let's use the node itself
-            item["value"] = self.extract_property_value(node, force=True)
+            item["value"] = self._extract_property_value(
+                node, force=True, items_seen=items_seen, base_url=base_url)
 
         # not in the specs, but can be handy
         if self.add_text_content:
-            textContent = self.extract_textContent(node)
+            textContent = self._extract_textContent(node)
             if textContent:
                 item["textContent"] = textContent
 
         return item
 
-    def extract_properties(self, node):
+    def _extract_properties(self, node, items_seen, base_url):
         for prop in self._xp_prop(node):
-            for p, v in self.extract_property(prop):
+            for p, v in self._extract_property(
+                    prop, items_seen=items_seen, base_url=base_url):
                 yield p, v
 
-    def extract_property_refs(self, node, refid):
+    def _extract_property_refs(self, node, refid, items_seen, base_url):
         for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid):
-            for p, v in self.extract_property(prop):
+            for p, v in self._extract_property(
+                    prop, items_seen=items_seen, base_url=base_url):
                 yield p, v
 
-    def extract_property(self, node):
+    def _extract_property(self, node, items_seen, base_url):
         props = node.get("itemprop").split()
-        value = self.extract_property_value(node)
+        value = self._extract_property_value(
+            node, items_seen=items_seen, base_url=base_url)
         return [(p, value) for p in props]
 
-    def extract_property_value(self, node, force=False):
+    def _extract_property_value(self, node, items_seen, base_url, force=False):
         #http://www.w3.org/TR/microdata/#values
         if not force and node.get("itemscope") is not None:
             if self.nested:
-                return self.extract_item(node)
+                return self._extract_item(
+                    node, items_seen=items_seen, base_url=base_url)
             else:
                 return {"iid_ref": self.get_docid(node)}
 
         elif node.tag == "meta":
             return node.get("content", "")
 
         elif node.tag in ("audio", "embed", "iframe", "img", "source", "track", "video"):
-            return urljoin(self.url, strip_html5_whitespace(node.get("src", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("src", "")))
 
         elif node.tag in ("a", "area", "link"):
-            return urljoin(self.url, strip_html5_whitespace(node.get("href", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("href", "")))
 
         elif node.tag in ("object",):
-            return urljoin(self.url, strip_html5_whitespace(node.get("data", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("data", "")))
 
         elif node.tag in ("data", "meter"):
             return node.get("value", "")
@@ -151,9 +159,9 @@ def extract_property_value(self, node, force=False):
             return node.get("content")
 
         else:
-            return self.extract_textContent(node)
+            return self._extract_textContent(node)
 
-    def extract_textContent(self, node):
+    def _extract_textContent(self, node):
         return u"".join(self._xp_clean_text(node)).strip()
 
 
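
Threading items_seen and base_url through as arguments, instead of storing them on self as before, removes the per-call mutable state, so one extractor instance should be reusable across calls (and plausibly across threads). A usage sketch, assuming the MicrodataExtractor alias for LxmlMicrodataExtractor that _extruct.py imports, with an illustrative snippet:

    from extruct.w3cmicrodata import MicrodataExtractor

    html = '''<div itemscope itemtype="http://schema.org/Product">
      <span itemprop="name">Executive Anvil</span>
      <a itemprop="url" href="/anvil">details</a>
    </div>'''

    mde = MicrodataExtractor()
    # Relative URLs such as href="/anvil" resolve against the per-call base_url.
    data = mde.extract(html, base_url='http://example.com/')
    # -> [{'type': 'http://schema.org/Product',
    #      'properties': {'name': 'Executive Anvil',
    #                     'url': 'http://example.com/anvil'}}]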
2 changes: 1 addition & 1 deletion tests/samples/schema.org/product_custom_url.json
@@ -1,7 +1,7 @@
 [{"type": "http://schema.org/Product",
   "properties": {"brand": "ACME",
                  "name": "Executive Anvil",
-                 "image": "http://example.com/anvil_executive.jpg",
+                 "image": "http://some-example.com/anvil_executive.jpg",
                  "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
                  "mpn": "925872",
                  "aggregateRating": {"type": "http://schema.org/AggregateRating",
