Skip to content

Commit

Permalink
Merge pull request #119 from scrapinghub/fix-incorrectly-formatted-description-property
Browse files Browse the repository at this point in the history

Fix incorrectly formatted description property
  • Loading branch information
jakubwasikowski committed Jul 19, 2019
2 parents 50a0915 + 670702e commit 6df8e19
Show file tree
Hide file tree
Showing 14 changed files with 3,133 additions and 13 deletions.
24 changes: 23 additions & 1 deletion extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,32 @@
from urllib.parse import urljoin

import lxml.etree
from lxml.html.clean import Cleaner
from w3lib.html import strip_html5_whitespace
import html_text

from extruct.utils import parse_html


# Cleaner which is similar to html_text cleaner, but is less aggressive
cleaner = Cleaner(
scripts=True,
javascript=False, # onclick attributes are fine
comments=True,
style=True,
links=True,
meta=True,
page_structure=False, # <title> may be nice to have
processing_instructions=True,
embedded=False, # keep embedded content
frames=False, # keep frames
forms=False, # keep forms
annoying_tags=False,
remove_unknown_tags=False,
safe_attrs_only=False,
)


class LxmlMicrodataExtractor(object):
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
Expand Down Expand Up @@ -182,7 +203,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
return self._extract_textContent(node)

def _extract_textContent(self, node):
return u"".join(self._xp_clean_text(node)).strip()
clean_node = cleaner.clean_html(node)
return html_text.etree_to_text(clean_node)


MicrodataExtractor = LxmlMicrodataExtractor
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ requests
rdflib
rdflib-jsonld
mf2py>=1.1.0
six
six>=1.11
w3lib
html-text
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def get_version():
'rdflib-jsonld',
'mf2py',
'w3lib',
'html-text>=0.5.1',
'six'],
extras_require={
'service': [
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/Event.002.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"offers": "foo-fighters-everlong-buy.html",
"url": "foo-fighters-everlong.html"},
"type": "http://schema.org/MusicRecording"}],
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
"video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
"duration": "T1M33S",
"name": "Interview with the Foo Fighters",
"thumbnail": "foo-fighters-interview-thumb.jpg"},
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/MusicRecording.001.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"offers": "foo-fighters-everlong-buy.html",
"url": "foo-fighters-everlong.html"},
"type": "http://schema.org/MusicRecording"}],
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
"video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
"duration": "T1M33S",
"name": "Interview with the Foo Fighters",
"thumbnail": "foo-fighters-interview-thumb.jpg"},
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/product-ref.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
],
"brand": "ACME",
"name": "Executive Anvil",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"type": "http://schema.org/AggregateRating",
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/product.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"properties": {"ratingValue": "4.4",
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/product_custom_url.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "http://some-example.com/anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"properties": {"ratingValue": "4.4",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "http://some-example.com/anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"_nodeId_": "aggregateRating",
Expand Down
6 changes: 3 additions & 3 deletions tests/samples/w3c/microdata.5.2.withtext.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
"name": "Tank Locomotive (DB 80)",
"product-code": "33041",
"scale": "HO"},
"textContent": "Name:\n Tank Locomotive (DB 80)\n Product code:\n 33041\n Scale:\n HO\n Digital:\n Delta",
"textContent": "Name:\nTank Locomotive (DB 80)\nProduct code:\n33041\nScale:\nHO\nDigital:\nDelta",
"type": ["http://md.example.com/loco",
"http://md.example.com/lighting"]},
{"properties": {"name": "Turnout Lantern Kit",
"product-code": "74470",
"scale": "HO",
"track-type": "C"},
"textContent": "Name:\n Turnout Lantern Kit\n Product code:\n 74470\n Purpose:\n For retrofitting 2 C Track\n turnouts.",
"textContent": "Name:\nTurnout Lantern Kit\nProduct code:\n74470\nPurpose:\nFor retrofitting 2 C Track turnouts.",
"type": ["http://md.example.com/track",
"http://md.example.com/lighting"]},
{"properties": {"name": "Express Train Passenger Car (DB Am 203)",
"product-code": "8710",
"scale": "Z"},
"textContent": "Name:\n Express Train Passenger Car (DB Am 203)\n Product code:\n 8710\n Scale:\n Z",
"textContent": "Name:\nExpress Train Passenger Car (DB Am 203)\nProduct code:\n8710\nScale:\nZ",
"type": "http://md.example.com/passengers"}]

0 comments on commit 6df8e19

Please sign in to comment.