Skip to content

Commit

Permalink
Merge pull request #81 from scrapinghub/add-option-to-return-html-nod…
Browse files Browse the repository at this point in the history
…e-during-extraction

[MRG+1] Add option to return html node during extraction
  • Loading branch information
jakubwasikowski committed Jun 6, 2018
2 parents 995f6b0 + fa06c83 commit e9c7456
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 23 deletions.
3 changes: 3 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ v0.5.0 (TBD)
* In ``w3microdata`` ``_extract_properties``, ``_extract_property_refs``,
``_extract_property``, ``_extract_property_value`` and ``_extract_item``
now need ``items_seen`` and ``url`` to be passed as arguments.
* Add argument ``return_html_node`` to ``extract``; it allows returning the HTML
node along with the result of metadata extraction. It is supported only by the
microdata syntax.

Warning: backward-incompatible change:

Expand Down
36 changes: 36 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,42 @@ To do so set ``uniform=True`` when calling ``extract``, it's false by default fo

NB rdfa structure is not uniformed yet

Returning HTML node
+++++++++++++++++++

It is also possible to get a reference to the HTML node of every extracted metadata item.
This feature is supported only by the microdata syntax.

To use it, set the ``return_html_node`` option of the ``extract`` method to ``True``.
As a result, an additional key "htmlNode" will be included in the result for every
item. Each node is of ``lxml.etree.Element`` type: ::

>>> r = requests.get('http://www.rugpadcorner.com/shop/no-muv/')
>>> base_url = get_base_url(r.text, r.url)
>>> data = extruct.extract(r.text, base_url, syntaxes=['microdata'], return_html_node=True)
>>>
>>> pp.pprint(data)
{ 'microdata': [ { 'htmlNode': <Element div at 0x7f10f8e6d3b8>,
'properties': { 'description': 'KEEP RUGS FLAT ON CARPET!\n'
'Not your thin sticky pad, '
'No-Muv is truly the best!',
'image': ['', ''],
'name': ['No-Muv', 'No-Muv'],
'offers': [ { 'htmlNode': <Element div at 0x7f10f8e6d138>,
'properties': { 'availability': 'http://schema.org/InStock',
'price': 'Price: '
'$45'},
'type': 'http://schema.org/Offer'},
{ 'htmlNode': <Element div at 0x7f10f8e60f48>,
'properties': { 'availability': 'http://schema.org/InStock',
'price': '(Select '
'Size/Shape '
'for '
'Pricing)'},
'type': 'http://schema.org/Offer'}],
'ratingValue': ['5.00', '5.00']},
'type': 'http://schema.org/Product'}]}

Single extractors
-----------------

Expand Down
7 changes: 6 additions & 1 deletion extruct/_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
syntaxes=SYNTAXES,
errors='strict',
uniform=False,
return_html_node=False,
schema_context='http://schema.org',
**kwargs):
"""htmlstring: string with valid html document;
Expand All @@ -34,6 +35,10 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
'@type': 'example_type',
/* All other the properties in keys here */
}
return_html_node: if True, it includes into the result a HTML node of
respective embedded metadata under 'htmlNode' key.
The feature is supported only by microdata syntax.
Each node is of `lxml.etree.Element` type.
schema_context: schema's context for current page"""
if base_url is None and 'url' in kwargs:
warnings.warn('"url" argument is deprecated, please use "base_url"',
Expand All @@ -51,7 +56,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
tree = fromstring(htmlstring, parser=domparser)
processors = []
if 'microdata' in syntaxes:
processors.append(('microdata', MicrodataExtractor().extract_items, tree))
processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
if 'json-ld' in syntaxes:
processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
if 'opengraph' in syntaxes:
Expand Down
7 changes: 5 additions & 2 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ class LxmlMicrodataExtractor(object):
+ count(ancestor::*[@itemscope])
+ 1""")

def __init__(self, nested=True, strict=False, add_text_content=False):
def __init__(self, nested=True, strict=False, add_text_content=False, add_html_node=False):
self.nested = nested
self.strict = strict
self.add_text_content = add_text_content
self.add_html_node = add_html_node

def get_docid(self, node):
return int(self._xp_item_docid(node))
Expand Down Expand Up @@ -101,11 +102,13 @@ def _extract_item(self, node, items_seen, base_url):
item["value"] = self._extract_property_value(
node, force=True, items_seen=items_seen, base_url=base_url)

# not in the specs, but can be handy
# below are not in the specs, but can be handy
if self.add_text_content:
textContent = self._extract_textContent(node)
if textContent:
item["textContent"] = textContent
if self.add_html_node:
item["htmlNode"] = node

return item

Expand Down
15 changes: 15 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'samples')


def get_testdata(*paths):
"""Return test data"""
path = os.path.join(tests_datadir, *paths)
Expand All @@ -14,3 +15,17 @@ def get_testdata(*paths):

def jsonize_dict(d):
    """Return *d* after a round trip through JSON serialization.

    The value is dumped to a JSON string and parsed back, so the result
    contains only what JSON can represent (e.g. tuples come back as lists
    and non-string dict keys come back as strings).
    """
    serialized = json.dumps(d)
    return json.loads(serialized)


def replace_node_ref_with_node_id(item):
    """Replace each ``htmlNode`` reference with the node's ``id``, in place.

    Nested lists and dicts are walked recursively.  In every dict, the
    ``"htmlNode"`` entry is removed and the result of calling ``.get("id")``
    on its value is stored under ``"_nodeId_"`` instead; all other values
    are walked in turn.  Anything that is neither a list nor a dict is left
    untouched.  The function mutates its argument and returns ``None``.
    """
    if isinstance(item, list):
        for element in item:
            replace_node_ref_with_node_id(element)
        return
    if not isinstance(item, dict):
        return
    # Iterate over a snapshot of the keys: the dict is modified as we go.
    for name in tuple(item):
        if name != "htmlNode":
            replace_node_ref_with_node_id(item[name])
            continue
        node = item[name]
        item["_nodeId_"] = node.get("id")
        del item[name]
14 changes: 7 additions & 7 deletions tests/samples/schema.org/CreativeWork.001.html
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,19 @@
}
</script>
<div>
<div itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/2292573321">
<div id="book1" itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/2292573321">
<h1><span itemprop="name">Rouge et le noir</span></h1>
<div>Author: <span property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
<div>Author: <span id="author1" property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
<div>Language: <span itemprop="inLanguage" content="fr">French</span></div>
<div>Has Translation: <span itemprop="workTranslation" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/460647">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></div>
<div>Has Translation: <span id="creativeWork1" itemprop="workTranslation" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/460647">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></div>
</div>
<div itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/460647">
<div id="book2" itemscope itemtype="http://schema.org/Book" itemid="http://worldcat.org/entity/work/id/460647">
<h1><span itemprop="name">Red and Black : A New Translation, Backgrounds and Sources, Criticism</span></h1>
<div>Author: <span itemprop="author" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
<div>Author: <span id="author2" itemprop="author" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/17823">Stendhal</span></div>
<div>Language: <span itemprop="inLanguage" content="en">English</span></div>
<div>Subject: <span itemprop="about">Psychological fiction, French</span></div>
<div>Translation of: <span itemprop="translationOfWork" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/2292573321">Rouge et le noir</span></div>
<div>Translator: <span property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/8453420">Robert Martin Adams</span></div>
<div>Translation of: <span id="creativeWork2" itemprop="translationOfWork" itemscope itemtype="http://schema.org/CreativeWork" itemid="http://worldcat.org/entity/work/id/2292573321">Rouge et le noir</span></div>
<div>Translator: <span id="translator2" property="itemprop" itemscope itemtype="http://schema.org/Person" itemid="http://viaf.org/viaf/8453420">Robert Martin Adams</span></div>
</div>
</div>
</body>
Expand Down
107 changes: 107 additions & 0 deletions tests/samples/schema.org/CreativeWork_flat_with_node_id.001.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
{
"microdata": [
{
"_nodeId_": "book1",
"@type": "Book",
"@context": "http://schema.org",
"id": "http://worldcat.org/entity/work/id/2292573321",
"name": "Rouge et le noir",
"inLanguage": "fr",
"workTranslation": {
"_nodeId_": "creativeWork1",
"@type": "CreativeWork",
"id": "http://worldcat.org/entity/work/id/460647",
"value": "Red and Black : A New Translation, Backgrounds and Sources, Criticism"
}
},
{
"_nodeId_": "author1",
"@type": "Person",
"@context": "http://schema.org",
"id": "http://viaf.org/viaf/17823",
"value": "Stendhal"
},
{
"_nodeId_": "book2",
"@type": "Book",
"@context": "http://schema.org",
"id": "http://worldcat.org/entity/work/id/460647",
"name": "Red and Black : A New Translation, Backgrounds and Sources, Criticism",
"author": {
"_nodeId_": "author2",
"@type": "Person",
"id": "http://viaf.org/viaf/17823",
"value": "Stendhal"
},
"inLanguage": "en",
"about": "Psychological fiction, French",
"translationOfWork": {
"_nodeId_": "creativeWork2",
"@type": "CreativeWork",
"id": "http://worldcat.org/entity/work/id/2292573321",
"value": "Rouge et le noir"
}
},
{
"_nodeId_": "translator2",
"@type": "Person",
"@context": "http://schema.org",
"id": "http://viaf.org/viaf/8453420",
"value": "Robert Martin Adams"
}
],
"json-ld": [
{
"@context": "http://schema.org",
"@type": "WebPage",
"breadcrumb": "Books > Literature & Fiction > Classics",
"mainEntity": {
"@type": "Book",
"author": "/author/jd_salinger.html",
"bookFormat": "http://schema.org/Paperback",
"datePublished": "1991-05-01",
"image": "catcher-in-the-rye-book-cover.jpg",
"inLanguage": "English",
"isbn": "0316769487",
"name": "The Catcher in the Rye",
"numberOfPages": "224",
"offers": {
"@type": "Offer",
"availability": "http://schema.org/InStock",
"price": "6.99",
"priceCurrency": "USD"
},
"publisher": "Little, Brown, and Company",
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4",
"reviewCount": "3077"
},
"review": [
{
"@type": "Review",
"author": "John Doe",
"datePublished": "2006-05-04",
"name": "A masterpiece of literature",
"reviewBody": "I really enjoyed this book. It captures the essential challenge people face as they try make sense of their lives and grow to adulthood.",
"reviewRating": {
"@type": "Rating",
"ratingValue": "5"
}
},
{
"@type": "Review",
"author": "Bob Smith",
"datePublished": "2006-06-15",
"name": "A good read.",
"reviewBody": "Catcher in the Rye is a fun book. It's a good book to read.",
"reviewRating": "4"
}
]
}
}
],
"opengraph": [],
"microformat": [],
"rdfa": []
}
8 changes: 4 additions & 4 deletions tests/samples/schema.org/product.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
</head>
<body>

<div itemscope itemtype="http://schema.org/Product">
<div id="product" itemscope itemtype="http://schema.org/Product">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src=" anvil_executive.jpg
Expand All @@ -15,18 +15,18 @@
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span id="aggregateRating" itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>

<span itemprop="offers" itemscope itemtype="http://schema.org/Offer">
<span id="offer" itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99 </span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span itemprop="seller" itemscope itemtype="http://schema.org/Organization">
Available from: <span id="organization" itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
Expand Down
23 changes: 23 additions & 0 deletions tests/samples/schema.org/product_custom_url_and_node_id.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[{"type": "http://schema.org/Product",
"_nodeId_": "product",
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "http://some-example.com/anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"_nodeId_": "aggregateRating",
"properties": {"ratingValue": "4.4",
"reviewCount": "89"}},
"offers": {"type": "http://schema.org/Offer",
"_nodeId_": "offer",
"properties": {"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {"type": "http://schema.org/Organization",
"_nodeId_": "organization",
"properties":{"name": "Executive Objects"}},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"}}
}
}]
19 changes: 13 additions & 6 deletions tests/test_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import unittest

import extruct
from tests import get_testdata, jsonize_dict
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id


class TestGeneric(unittest.TestCase):
Expand All @@ -17,25 +17,32 @@ def test_all(self):
self.assertEqual(jsonize_dict(data), expected)

def test_microdata_custom_url(self):
body, expected = self._microdata_custom_url()
body, expected = self._microdata_custom_url('product_custom_url.json')
data = extruct.extract(body, base_url='http://some-example.com',
syntaxes=['microdata'])
self.assertEqual(data, expected)

def test_microdata_with_returning_node(self):
body, expected = self._microdata_custom_url('product_custom_url_and_node_id.json')
data = extruct.extract(body, base_url='http://some-example.com',
syntaxes=['microdata'], return_html_node=True)
replace_node_ref_with_node_id(data)
self.assertEqual(data, expected)

def test_deprecated_url(self):
body, expected = self._microdata_custom_url()
body, expected = self._microdata_custom_url('product_custom_url.json')
data = extruct.extract(body, url='http://some-example.com',
syntaxes=['microdata'])
self.assertEqual(data, expected)

def test_extra_kwargs(self):
body, expected = self._microdata_custom_url()
body, expected = self._microdata_custom_url('product_custom_url.json')
with self.assertRaises(TypeError):
extruct.extract(body, foo='bar')

def _microdata_custom_url(self):
def _microdata_custom_url(self, test_file):
body = get_testdata('schema.org', 'product.html')
expected = {'microdata': json.loads(
get_testdata('schema.org', 'product_custom_url.json')
get_testdata('schema.org', test_file)
.decode('UTF-8'))}
return body, expected

0 comments on commit e9c7456

Please sign in to comment.