Merge pull request #79 from scrapinghub/base-url
Use base_url instead of url
kmike committed May 15, 2018
2 parents f76b918 + 485dd26 commit e0d2d22
Showing 11 changed files with 119 additions and 73 deletions.
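
In practice, the API change this commit makes looks as follows (a minimal sketch distilled from the README diff below; the fetched URL is illustrative):

    import extruct
    import requests
    from w3lib.html import get_base_url

    r = requests.get('https://example.com/page.html')  # illustrative URL

    # New calling convention: compute the document's base URL (which honours
    # any <base href="..."> tag) and pass it explicitly.
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Old calling convention, still accepted by this commit but deprecated:
    # data = extruct.extract(r.text, url=r.url)  # emits DeprecationWarning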
19 changes: 12 additions & 7 deletions README.rst
@@ -56,8 +56,9 @@ Usage
 All-in-one extraction
 +++++++++++++++++++++
 
-The simplest example how to use extruct is to call ``extruct.extract(htmlstring, url)``
-with some HTML string and a URL.
+The simplest example how to use extruct is to call
+``extruct.extract(htmlstring, base_url=base_url)``
+with some HTML string and an optional base URL.
 
 Let's try this on a webpage that uses all the syntaxes supported (RDFa with `ogp`_).
 
@@ -66,10 +67,12 @@ First fetch the HTML using python-requests and then feed the response body to ``
 >>> import extruct
 >>> import requests
 >>> import pprint
+>>> from w3lib.html import get_base_url
 >>>
 >>> pp = pprint.PrettyPrinter(indent=2)
 >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/')
->>> data = extruct.extract(r.text, r.url)
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url=base_url)
 >>>
 >>> pp.pprint(data)
 { 'json-ld': [ { '@context': 'https://schema.org',
@@ -167,7 +170,8 @@ Select syntaxes
 It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned::
 
 >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
->>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'])
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url, syntaxes=['microdata', 'opengraph', 'rdfa'])
 >>>
 >>> pp.pprint(data)
 { 'microdata': [],
@@ -217,7 +221,8 @@ Another option is to uniform the output of microformat, opengraph, microdata and
 To do so set ``uniform=True`` when calling ``extract``, it's false by default for backward compatibility. Here the same example as before but with uniform set to True: ::
 
 >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
->>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
+>>> base_url = get_base_url(r.text, r.url)
+>>> data = extruct.extract(r.text, base_url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
 >>>
 >>> pp.pprint(data)
 { 'microdata': [],
@@ -387,7 +392,7 @@ RDFa extraction (experimental)
 ... """
 >>>
 >>> rdfae = RDFaExtractor()
->>> pp.pprint(rdfae.extract(html, url='http://www.example.com/index.html'))
+>>> pp.pprint(rdfae.extract(html, base_url='http://www.example.com/index.html'))
 [{'@id': 'http://www.example.com/alice/posts/trouble_with_bob',
   '@type': ['http://schema.org/BlogPosting'],
   'http://purl.org/dc/terms/creator': [{'@id': 'http://www.example.com/index.html#me'}],
@@ -441,7 +446,7 @@ Open Graph extraction
 ... </html>"""
 >>>
 >>> opengraphe = OpenGraphExtractor()
->>> pp.pprint(opengraphe.extract(html, url='http://www.example.com/index.html'))
+>>> pp.pprint(opengraphe.extract(html))
 [{"namespace": {
      "og": "http://ogp.me/ns#"
   },
32 changes: 19 additions & 13 deletions extruct/_extruct.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 
 from lxml.html import fromstring
 
@@ -15,13 +16,14 @@
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
 
 
-def extract(htmlstring, url=None, encoding="UTF-8",
+def extract(htmlstring, base_url=None, encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
-            schema_context='http://schema.org'):
+            schema_context='http://schema.org',
+            **kwargs):
     """htmlstring: string with valid html document;
-       url: url of the html documents
+       base_url: base url of the html document
        encoding: encoding of the html document
        syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
@@ -33,6 +35,12 @@ def extract(htmlstring, url=None, encoding="UTF-8",
           /* All other the properties in keys here */
        }
       schema_context: schema's context for current page"""
+    if base_url is None and 'url' in kwargs:
+        warnings.warn('"url" argument is deprecated, please use "base_url"',
+                      DeprecationWarning)
+        base_url = kwargs.pop('url')
+    if kwargs:
+        raise TypeError('Unexpected keyword arguments')
     if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
@@ -43,24 +51,22 @@ def extract(htmlstring, url=None, encoding="UTF-8",
     tree = fromstring(htmlstring, parser=domparser)
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor().extract_items))
+        processors.append(('microdata', MicrodataExtractor().extract_items, tree))
     if 'json-ld' in syntaxes:
-        processors.append(('json-ld', JsonLdExtractor().extract_items))
+        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
     if 'opengraph' in syntaxes:
-        processors.append(('opengraph', OpenGraphExtractor().extract_items))
+        processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
     if 'microformat' in syntaxes:
-        processors.append(('microformat', MicroformatExtractor().extract_items))
+        processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
     if 'rdfa' in syntaxes:
-        processors.append(('rdfa', RDFaExtractor().extract_items))
+        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
     output = {}
-    for label, extract in processors:
+    for label, extract, document in processors:
         try:
-            output[label] = [obj for obj in extract(document=tree,
-                                                    url=url,
-                                                    html=htmlstring)]
+            output[label] = list(extract(document, base_url=base_url))
         except Exception:
             if errors == 'log':
-                logger.exception("Failed to extract {} from {}".format(label, url))
+                logger.exception('Failed to extract {}'.format(label))
             if errors == 'ignore':
                 pass
             if errors == 'strict':
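
The deprecation shim added above can be exercised directly; a short sketch of the behaviour it implements (the empty HTML document is illustrative):

    import warnings
    import extruct

    html = '<html><body></body></html>'

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        extruct.extract(html, url='http://example.com/')  # deprecated keyword
        assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Any other unexpected keyword is rejected outright:
    # extruct.extract(html, foo='bar')  -> TypeError('Unexpected keyword arguments')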
7 changes: 4 additions & 3 deletions extruct/jsonld.py
@@ -12,15 +12,16 @@
 
 HTML_OR_JS_COMMENTLINE = re.compile('^\s*(//.*|<!--.*-->)')
 
 
 class JsonLdExtractor(object):
     _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8"):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         parser = lxml.html.HTMLParser(encoding=encoding)
         lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
-        return self.extract_items(lxmldoc)
+        return self.extract_items(lxmldoc, base_url=base_url)
 
-    def extract_items(self, document, *args, **kwargs):
+    def extract_items(self, document, base_url=None):
         return [item for items in map(self._extract_items,
                                       self._xp_jsonld(document))
                 for item in items
9 changes: 5 additions & 4 deletions extruct/microformat.py
@@ -1,10 +1,11 @@
 import mf2py
 
 
 class MicroformatExtractor(object):
 
-    def extract(self, htmlstring, url=None, encoding='UTF-8'):
-        return list(self.extract_items(htmlstring, url=url))
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        return list(self.extract_items(htmlstring, base_url=base_url))
 
-    def extract_items(self, html, url, document=None):
-        for obj in mf2py.parse(html, html_parser="lxml", url=url)['items']:
+    def extract_items(self, html, base_url=None):
+        for obj in mf2py.parse(html, html_parser="lxml", url=base_url)['items']:
             yield obj
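
With this change every extractor exposes the same extract_items(document, base_url=...) shape, which is what lets the dispatch loop in _extruct.py above call them uniformly; microformats are the one case that parses the raw HTML string rather than an lxml tree, hence htmlstring in its processors entry. A hedged sketch of calling this extractor directly (the h-entry snippet is illustrative):

    from extruct.microformat import MicroformatExtractor

    html = '<article class="h-entry"><h1 class="p-name">Hello</h1></article>'
    mfe = MicroformatExtractor()
    # mf2py resolves relative URLs in the document against base_url.
    items = mfe.extract(html, base_url='http://example.com/')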
6 changes: 3 additions & 3 deletions extruct/opengraph.py
@@ -5,12 +5,12 @@
 class OpenGraphExtractor(object):
     """OpenGraph extractor following extruct API."""
 
-    def extract(self, htmlstring, url=None, encoding='UTF-8'):
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
         parser = lxml.html.HTMLParser(encoding=encoding)
         doc = lxml.html.fromstring(htmlstring, parser=parser)
-        return list(self.extract_items(doc))
+        return list(self.extract_items(doc, base_url=base_url))
 
-    def extract_items(self, document, *args, **kwargs):
+    def extract_items(self, document, base_url=None):
         # OpenGraph defines a web page as a single rich object.
         # TODO: Handle known opengraph namespaces.
         for head in document.xpath('//head'):
10 changes: 5 additions & 5 deletions extruct/rdfa.py
@@ -29,14 +29,14 @@
 
 class RDFaExtractor(object):
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8",
-                expanded=True):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8",
+                expanded=True):
 
         domparser = XmlDomHTMLParser(encoding=encoding)
         tree = fromstring(htmlstring, parser=domparser)
-        return self.extract_items(tree, url, expanded=expanded)
+        return self.extract_items(tree, base_url=base_url, expanded=expanded)
 
-    def extract_items(self, document, url, expanded=True, *args, **kwargs):
+    def extract_items(self, document, base_url=None, expanded=True):
         options = Options(output_processor_graph=True,
                           embedded_rdf=False,
                           space_preserve=True,
@@ -46,6 +46,6 @@ def extract_items(self, document, url, expanded=True, *args, **kwargs):
                           refresh_vocab_cache=False,
                           check_lite=False)
 
-        g = PyRdfa(options, base=url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
+        g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
         return json.loads(jsonld_string)
66 changes: 37 additions & 29 deletions extruct/w3cmicrodata.py
@@ -19,6 +19,7 @@
 import lxml.html
 from w3lib.html import strip_html5_whitespace
 
+
 class LxmlMicrodataExtractor(object):
     _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
     _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
@@ -39,26 +40,26 @@ def __init__(self, nested=True, strict=False, add_text_content=False):
     def get_docid(self, node):
         return int(self._xp_item_docid(node))
 
-    def extract(self, htmlstring, url=None, encoding="UTF-8"):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         parser = lxml.html.HTMLParser(encoding=encoding)
         lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
-        return self.extract_items(lxmldoc, url)
+        return self.extract_items(lxmldoc, base_url)
 
-    def extract_items(self, document, url, *args, **kwargs):
-        self.url = url
-        self.items_seen = set()
-        return [item
-                for item in map(self.extract_item,
-                                self._xp_item(document))
-                if item]
+    def extract_items(self, document, base_url):
+        items_seen = set()
+        return [
+            item for item in (
+                self._extract_item(it, items_seen=items_seen, base_url=base_url)
+                for it in self._xp_item(document))
+            if item]
 
-    def extract_item(self, node):
+    def _extract_item(self, node, items_seen, base_url):
         itemid = self.get_docid(node)
 
         if self.nested:
-            if itemid in self.items_seen:
+            if itemid in items_seen:
                 return
-            self.items_seen.add(itemid)
+            items_seen.add(itemid)
 
         item = {}
         if not self.nested:
@@ -79,10 +80,12 @@ def extract_item(self, node):
         refs = node.get('itemref', '').split()
         if refs:
             for refid in refs:
-                for name, value in self.extract_property_refs(node, refid):
+                for name, value in self._extract_property_refs(
+                        node, refid, items_seen=items_seen, base_url=base_url):
                     properties[name].append(value)
 
-        for name, value in self.extract_properties(node):
+        for name, value in self._extract_properties(
+                node, items_seen=items_seen, base_url=base_url):
             properties[name].append(value)
 
         props = []
@@ -95,50 +98,55 @@
             item["properties"] = dict(props)
         else:
             # item without properties; let's use the node itself
-            item["value"] = self.extract_property_value(node, force=True)
+            item["value"] = self._extract_property_value(
+                node, force=True, items_seen=items_seen, base_url=base_url)
 
         # not in the specs, but can be handy
         if self.add_text_content:
-            textContent = self.extract_textContent(node)
+            textContent = self._extract_textContent(node)
             if textContent:
                 item["textContent"] = textContent
 
         return item
 
-    def extract_properties(self, node):
+    def _extract_properties(self, node, items_seen, base_url):
         for prop in self._xp_prop(node):
-            for p, v in self.extract_property(prop):
+            for p, v in self._extract_property(
+                    prop, items_seen=items_seen, base_url=base_url):
                 yield p, v
 
-    def extract_property_refs(self, node, refid):
+    def _extract_property_refs(self, node, refid, items_seen, base_url):
         for prop in node.xpath("id($refid)/descendant-or-self::*[@itemprop]", refid=refid):
-            for p, v in self.extract_property(prop):
+            for p, v in self._extract_property(
+                    prop, items_seen=items_seen, base_url=base_url):
                 yield p, v
 
-    def extract_property(self, node):
+    def _extract_property(self, node, items_seen, base_url):
         props = node.get("itemprop").split()
-        value = self.extract_property_value(node)
+        value = self._extract_property_value(
+            node, items_seen=items_seen, base_url=base_url)
         return [(p, value) for p in props]
 
-    def extract_property_value(self, node, force=False):
+    def _extract_property_value(self, node, items_seen, base_url, force=False):
         #http://www.w3.org/TR/microdata/#values
         if not force and node.get("itemscope") is not None:
             if self.nested:
-                return self.extract_item(node)
+                return self._extract_item(
+                    node, items_seen=items_seen, base_url=base_url)
             else:
                 return {"iid_ref": self.get_docid(node)}
 
         elif node.tag == "meta":
             return node.get("content", "")
 
         elif node.tag in ("audio", "embed", "iframe", "img", "source", "track", "video"):
-            return urljoin(self.url, strip_html5_whitespace(node.get("src", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("src", "")))
 
         elif node.tag in ("a", "area", "link"):
-            return urljoin(self.url, strip_html5_whitespace(node.get("href", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("href", "")))
 
         elif node.tag in ("object",):
-            return urljoin(self.url, strip_html5_whitespace(node.get("data", "")))
+            return urljoin(base_url, strip_html5_whitespace(node.get("data", "")))
 
         elif node.tag in ("data", "meter"):
             return node.get("value", "")
@@ -151,9 +159,9 @@ def extract_property_value(self, node, force=False):
             return node.get("content")
 
         else:
-            return self.extract_textContent(node)
+            return self._extract_textContent(node)
 
-    def extract_textContent(self, node):
+    def _extract_textContent(self, node):
         return u"".join(self._xp_clean_text(node)).strip()
 
 
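
Threading items_seen and base_url through as arguments, instead of storing them on self as before, removes the per-call mutable state, so one extractor instance should be reusable across calls (and plausibly across threads). A usage sketch, assuming the MicrodataExtractor alias for LxmlMicrodataExtractor that _extruct.py imports, with an illustrative snippet:

    from extruct.w3cmicrodata import MicrodataExtractor

    html = '''<div itemscope itemtype="http://schema.org/Product">
      <span itemprop="name">Executive Anvil</span>
      <a itemprop="url" href="/anvil">details</a>
    </div>'''

    mde = MicrodataExtractor()
    # Relative URLs such as href="/anvil" resolve against the per-call base_url.
    data = mde.extract(html, base_url='http://example.com/')
    # -> [{'type': 'http://schema.org/Product',
    #      'properties': {'name': 'Executive Anvil',
    #                     'url': 'http://example.com/anvil'}}]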
2 changes: 1 addition & 1 deletion tests/samples/schema.org/product_custom_url.json
@@ -1,7 +1,7 @@
 [{"type": "http://schema.org/Product",
   "properties": {"brand": "ACME",
                  "name": "Executive Anvil",
-                 "image": "http://example.com/anvil_executive.jpg",
+                 "image": "http://some-example.com/anvil_executive.jpg",
                  "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
                  "mpn": "925872",
                  "aggregateRating": {"type": "http://schema.org/AggregateRating",
