Skip to content

Commit

Permalink
Merge pull request #71 from scrapinghub/fix-microdata-properties-urljoin
Browse files (browse the repository at this point in the history)
Fix urljoin in microdata
  • Loading branch information
kmike committed Apr 4, 2018
2 parents: 0c4d6dd + eda312c; commit: 54ef8e7
Show file tree
Hide file tree
Showing 17 changed files with 145 additions and 43 deletions.
2 changes: 1 addition & 1 deletion extruct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger = logging.getLogger(__name__)
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']

def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8",
def extract(htmlstring, url=None, encoding="UTF-8",
syntaxes=SYNTAXES,
errors='strict'):
"""htmlstring: string with valid html document;
Expand Down
2 changes: 1 addition & 1 deletion extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
class JsonLdExtractor(object):
_xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')

def extract(self, htmlstring, url='http://www.example.com/', encoding="UTF-8"):
def extract(self, htmlstring, url=None, encoding="UTF-8"):
parser = lxml.html.HTMLParser(encoding=encoding)
lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
return self.extract_items(lxmldoc)
Expand Down
2 changes: 1 addition & 1 deletion extruct/microformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

class MicroformatExtractor(object):

def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
def extract(self, htmlstring, url=None, encoding='UTF-8'):
return list(self.extract_items(htmlstring, url=url))

def extract_items(self, html, url, document=None):
Expand Down
2 changes: 1 addition & 1 deletion extruct/opengraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
class OpenGraphExtractor(object):
"""OpenGraph extractor following extruct API."""

def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
def extract(self, htmlstring, url=None, encoding='UTF-8'):
parser = lxml.html.HTMLParser(encoding=encoding)
doc = lxml.html.fromstring(htmlstring, parser=parser)
return list(self.extract_items(doc))
Expand Down
2 changes: 1 addition & 1 deletion extruct/rdfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

class RDFaExtractor(object):

def extract(self, htmlstring, url='http://www.example.com/', encoding="UTF-8",
def extract(self, htmlstring, url=None, encoding="UTF-8",
expanded=True):

domparser = XmlDomHTMLParser(encoding=encoding)
Expand Down
10 changes: 5 additions & 5 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import lxml.etree
import lxml.html

from w3lib.html import strip_html5_whitespace

class LxmlMicrodataExtractor(object):
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
Expand All @@ -39,7 +39,7 @@ def __init__(self, nested=True, strict=False, add_text_content=False):
def get_docid(self, node):
return int(self._xp_item_docid(node))

def extract(self, htmlstring, url='http://www.example.com/', encoding="UTF-8"):
def extract(self, htmlstring, url=None, encoding="UTF-8"):
parser = lxml.html.HTMLParser(encoding=encoding)
lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
return self.extract_items(lxmldoc, url)
Expand Down Expand Up @@ -132,13 +132,13 @@ def extract_property_value(self, node, force=False):
return node.get("content", "")

elif node.tag in ("audio", "embed", "iframe", "img", "source", "track", "video"):
return urljoin(self.url, node.get("src", ""))
return urljoin(self.url, strip_html5_whitespace(node.get("src", "")))

elif node.tag in ("a", "area", "link"):
return urljoin(self.url, node.get("href", ""))
return urljoin(self.url, strip_html5_whitespace(node.get("href", "")))

elif node.tag in ("object",):
return urljoin(self.url, node.get("data", ""))
return urljoin(self.url, strip_html5_whitespace(node.get("data", "")))

elif node.tag in ("data", "meter"):
return node.get("value", "")
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ gevent
requests
rdflib
rdflib-jsonld
mf2py
mf2py
w3lib
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ def get_version():
},
packages=find_packages(exclude=['tests',]),
package_data={'extruct': ['VERSION']},
install_requires=['lxml', 'rdflib', 'rdflib-jsonld'],
install_requires=['lxml',
'rdflib',
'rdflib-jsonld',
'mf2py',
'w3lib'],
extras_require={
'service': [
'bottle',
Expand Down
4 changes: 2 additions & 2 deletions tests/samples/schema.org/Event.001.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[{"properties": {"location": {"properties": {"address": {"properties": {"addressLocality": "Philadelphia",
"addressRegion": "PA"},
"type": "http://schema.org/PostalAddress"},
"url": "http://www.example.com/wells-fargo-center.html"},
"url": "wells-fargo-center.html"},
"type": "http://schema.org/Place"},
"name": "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)",
"offers": {"properties": {"lowPrice": "$35",
"offerCount": "1938"},
"type": "http://schema.org/AggregateOffer"},
"startDate": "2016-04-21T20:00",
"url": "http://www.example.com/nba-miami-philidelphia-game3.html"},
"url": "nba-miami-philidelphia-game3.html"},
"type": "http://schema.org/Event"}]
26 changes: 13 additions & 13 deletions tests/samples/schema.org/Event.002.json
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
[{"properties": {"event": [{"properties": {"location": "Memphis, TN, US",
"name": "FedExForum",
"offers": "http://www.example.com/ticketmaster.com/foofighters/may20-2011",
"offers": "ticketmaster.com/foofighters/may20-2011",
"startDate": "2011-05-20",
"url": "http://www.example.com/foo-fighters-may20-fedexforum"},
"url": "foo-fighters-may20-fedexforum"},
"type": "http://schema.org/Event"},
{"properties": {"location": "Council Bluffs, IA, US",
"name": "Mid America Center",
"offers": "http://www.example.com/ticketmaster.com/foofighters/may23-2011",
"offers": "ticketmaster.com/foofighters/may23-2011",
"startDate": "2011-05-23",
"url": "http://www.example.com/foo-fighters-may23-midamericacenter"},
"url": "foo-fighters-may23-midamericacenter"},
"type": "http://schema.org/Event"}],
"image": ["http://www.example.com/foofighters-1.jpg",
"http://www.example.com/foofighters-2.jpg",
"http://www.example.com/foofighters-3.jpg"],
"image": ["foofighters-1.jpg",
"foofighters-2.jpg",
"foofighters-3.jpg"],
"interactionStatistic": {"properties": {"interactionType": "http://schema.org/CommentAction",
"userInteractionCount": "18"},
"type": "http://schema.org/InteractionCounter"},
"name": "Foo Fighters",
"track": [{"properties": {"audio": "http://www.example.com/foo-fighters-rope-play.html",
"track": [{"properties": {"audio": "foo-fighters-rope-play.html",
"duration": "PT4M5S",
"inAlbum": "http://www.example.com/foo-fighters-wasting-light.html",
"inAlbum": "foo-fighters-wasting-light.html",
"interactionCount": {"properties": {"interactionType": "http://schema.org/ListenAction",
"userInteractionCount": "14300"},
"type": "http://schema.org/InteractionCounter"},
"name": "Rope",
"offers": "http://www.example.com/foo-fighters-rope-buy.html",
"offers": "foo-fighters-rope-buy.html",
"url": "foo-fighters-rope.html"},
"type": "http://schema.org/MusicRecording"},
{"properties": {"audio": "http://www.example.com/foo-fighters-everlong-play.html",
{"properties": {"audio": "foo-fighters-everlong-play.html",
"duration": "PT6M33S",
"inAlbum": "http://www.example.com/foo-fighters-color-and-shape.html",
"inAlbum": "foo-fighters-color-and-shape.html",
"interactionCount": {"properties": {"interactionType": "http://schema.org/ListenAction",
"userInteractionCount": "11700"},
"type": "http://schema.org/InteractionCounter"},
"name": "Everlong",
"offers": "http://www.example.com/foo-fighters-everlong-buy.html",
"offers": "foo-fighters-everlong-buy.html",
"url": "foo-fighters-everlong.html"},
"type": "http://schema.org/MusicRecording"}],
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
Expand Down
2 changes: 1 addition & 1 deletion tests/samples/schema.org/Event.008.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"sameAs": "http://www.shakespearesglobe.com/"},
"type": "http://schema.org/PerformingArtsTheater"},
"name": "Julius Caesar at Shakespeare's Globe",
"offers": {"properties": {"url": "http://www.example.com/examples/ticket/0012301230123"},
"offers": {"properties": {"url": "/examples/ticket/0012301230123"},
"type": "http://schema.org/Offer"},
"startDate": "2014-10-01T19:30",
"workPerformed": {"properties": {"creator": {"properties": {"name": "William Shakespeare",
Expand Down
26 changes: 13 additions & 13 deletions tests/samples/schema.org/MusicRecording.001.json
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
[{"properties": {"event": [{"properties": {"location": "Memphis, TN, US",
"name": "FedExForum",
"offers": "http://www.example.com/ticketmaster.com/foofighters/may20-2011",
"offers": "ticketmaster.com/foofighters/may20-2011",
"startDate": "2011-05-20",
"url": "http://www.example.com/foo-fighters-may20-fedexforum"},
"url": "foo-fighters-may20-fedexforum"},
"type": "http://schema.org/Event"},
{"properties": {"location": "Council Bluffs, IA, US",
"name": "Mid America Center",
"offers": "http://www.example.com/ticketmaster.com/foofighters/may23-2011",
"offers": "ticketmaster.com/foofighters/may23-2011",
"startDate": "2011-05-23",
"url": "http://www.example.com/foo-fighters-may23-midamericacenter"},
"url": "foo-fighters-may23-midamericacenter"},
"type": "http://schema.org/Event"}],
"image": ["http://www.example.com/foofighters-1.jpg",
"http://www.example.com/foofighters-2.jpg",
"http://www.example.com/foofighters-3.jpg"],
"image": ["foofighters-1.jpg",
"foofighters-2.jpg",
"foofighters-3.jpg"],
"interactionStatistic": {"properties": {"interactionType": "http://schema.org/CommentAction",
"userInteractionCount": "18"},
"type": "http://schema.org/InteractionCounter"},
"name": "Foo Fighters",
"track": [{"properties": {"audio": "http://www.example.com/foo-fighters-rope-play.html",
"track": [{"properties": {"audio": "foo-fighters-rope-play.html",
"duration": "PT4M5S",
"inAlbum": "http://www.example.com/foo-fighters-wasting-light.html",
"inAlbum": "foo-fighters-wasting-light.html",
"interactionCount": {"properties": {"interactionType": "http://schema.org/ListenAction",
"userInteractionCount": "14300"},
"type": "http://schema.org/InteractionCounter"},
"name": "Rope",
"offers": "http://www.example.com/foo-fighters-rope-buy.html",
"offers": "foo-fighters-rope-buy.html",
"url": "foo-fighters-rope.html"},
"type": "http://schema.org/MusicRecording"},
{"properties": {"audio": "http://www.example.com/foo-fighters-everlong-play.html",
{"properties": {"audio": "foo-fighters-everlong-play.html",
"duration": "PT6M33S",
"inAlbum": "http://www.example.com/foo-fighters-color-and-shape.html",
"inAlbum": "foo-fighters-color-and-shape.html",
"interactionCount": {"properties": {"interactionType": "http://schema.org/ListenAction",
"userInteractionCount": "11700"},
"type": "http://schema.org/InteractionCounter"},
"name": "Everlong",
"offers": "http://www.example.com/foo-fighters-everlong-buy.html",
"offers": "foo-fighters-everlong-buy.html",
"url": "foo-fighters-everlong.html"},
"type": "http://schema.org/MusicRecording"}],
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
Expand Down
38 changes: 38 additions & 0 deletions tests/samples/schema.org/product.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Photo gallery</title>
</head>
<body>

<div itemscope itemtype="http://schema.org/Product">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src=" anvil_executive.jpg
" alt="Executive Anvil logo" />
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
Executive Anvil is perfect for the business traveler
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>

<span itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99 </span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
in excellent condition
<link itemprop="availability" href=" http://schema.org/InStock"/>In stock! Order now!
</span>
</div>
</body>
</html>
19 changes: 19 additions & 0 deletions tests/samples/schema.org/product.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[{"type": "http://schema.org/Product",
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"properties": {"ratingValue": "4.4",
"reviewCount": "89"}},
"offers": {"type": "http://schema.org/Offer",
"properties": {"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {"type": "http://schema.org/Organization",
"properties":{"name": "Executive Objects"}},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"}}
}
}]
19 changes: 19 additions & 0 deletions tests/samples/schema.org/product_custom_url.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[{"type": "http://schema.org/Product",
"properties": {"brand": "ACME",
"name": "Executive Anvil",
"image": "http://example.com/anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {"type": "http://schema.org/AggregateRating",
"properties": {"ratingValue": "4.4",
"reviewCount": "89"}},
"offers": {"type": "http://schema.org/Offer",
"properties": {"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "2020-11-05",
"seller": {"type": "http://schema.org/Organization",
"properties":{"name": "Executive Objects"}},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"}}
}
}]
4 changes: 2 additions & 2 deletions tests/samples/w3c/microdata.5.5.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[{"properties": {"title": ["The house I found."],
"work": ["http://www.example.com/images/house.jpeg"],
"work": ["images/house.jpeg"],
"license": ["http://www.opensource.org/licenses/mit-license.php"]},
"type": ["http://n.whatwg.org/work"]},
{"properties": {"title": ["The mailbox."],
"work": ["http://www.example.com/images/mailbox.jpeg"],
"work": ["images/mailbox.jpeg"],
"license": ["http://www.opensource.org/licenses/mit-license.php"]},
"type": ["http://n.whatwg.org/work"]}]

0 comments on commit 54ef8e7

Please sign in to comment.