-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #61 from scrapinghub/add_syntaxes
Add og and microformat extraction
- Loading branch information
Showing
23 changed files
with
1,624 additions
and
1,130 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,54 @@ | ||
import logging | ||
from lxml.html import fromstring | ||
|
||
from extruct.jsonld import JsonLdExtractor | ||
from extruct.rdfa import RDFaExtractor | ||
from extruct.w3cmicrodata import MicrodataExtractor | ||
from extruct.opengraph import OpenGraphExtractor | ||
from extruct.microformat import MicroformatExtractor | ||
from extruct.xmldom import XmlDomHTMLParser | ||
|
||
logger = logging.getLogger(__name__)
# Every syntax name accepted by extract(); also the default selection.
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']


def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict'):
    """Extract structured data from an HTML document.

    htmlstring: string with valid html document;
    url: url of the html document
    encoding: encoding of the html document
    syntaxes: list of syntaxes to extract, default SYNTAXES
    errors: set to 'log' to log extraction exceptions through this module's
            logger, 'ignore' to ignore them or 'strict' (default) to raise
            them

    Returns a dict mapping each requested syntax name to the list of items
    extracted for it. With errors set to 'log' or 'ignore', a syntax whose
    extractor failed is absent from the result.

    Raises ValueError if `syntaxes` is not a list of known syntax names or
    if `errors` is not one of the accepted values.
    """
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ('log', 'ignore', 'strict'):
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)

    # Extractors run in this fixed order, whatever the order of `syntaxes`.
    available = (
        ('microdata', MicrodataExtractor),
        ('json-ld', JsonLdExtractor),
        ('opengraph', OpenGraphExtractor),
        ('microformat', MicroformatExtractor),
        ('rdfa', RDFaExtractor),
    )
    processors = [(label, extractor_cls().extract_items)
                  for label, extractor_cls in available
                  if label in syntaxes]

    output = {}
    # The loop variable is deliberately not called `extract`, which would
    # shadow this function inside its own body.
    for label, extract_items in processors:
        try:
            # Each extractor receives both the parsed tree and the raw
            # markup, passed by keyword, and picks whichever it needs.
            output[label] = list(extract_items(document=tree,
                                               url=url,
                                               html=htmlstring))
        except Exception:
            if errors == 'log':
                logger.exception("Failed to parse %s", url)
            elif errors == 'strict':
                # A bare raise keeps the original traceback intact.
                raise
            # errors == 'ignore': leave this syntax out of the output.
    return output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import mf2py | ||
|
||
class MicroformatExtractor(object):
    """Microformat extractor that delegates parsing to mf2py."""

    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
        """Return the microformat items found in `htmlstring` as a list.

        `url` is forwarded to mf2py. `encoding` is accepted but not used
        by this extractor.
        """
        return list(self.extract_items(htmlstring, url=url))

    def extract_items(self, html, url, document=None):
        """Yield each entry of the 'items' list in mf2py's parse result.

        `html` is the raw markup handed to mf2py. `document` is accepted
        so callers can pass a parsed tree by keyword, but it is not used
        here.
        """
        for obj in mf2py.parse(html, html_parser="lxml", url=url)['items']:
            yield obj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import re | ||
import lxml.html | ||
|
||
|
||
class OpenGraphExtractor(object):
    """OpenGraph extractor following extruct API."""

    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
        """Parse `htmlstring` with lxml and return its OpenGraph items as a list.

        `url` is accepted but not used by this extractor.
        """
        parser = lxml.html.HTMLParser(encoding=encoding)
        doc = lxml.html.fromstring(htmlstring, parser=parser)
        return list(self.extract_items(doc))

    def extract_items(self, document, *args, **kwargs):
        """Yield one dict per <head> element that carries OpenGraph metadata.

        Each dict has a 'namespace' key (prefix -> namespace URI mapping)
        and a 'properties' key (list of (property, content) tuples in
        document order). Extra positional and keyword arguments are
        accepted and ignored.
        """
        # OpenGraph defines a web page as a single rich object.
        # TODO: Handle known opengraph namespaces.
        for head in document.xpath('//head'):
            # Namespaces come from the head's `prefix` attribute, written as
            # "name: uri" pairs; `og` is always present via the default below.
            declared = head.attrib.get('prefix', '')
            prefix = dict(re.findall(r'\s*(\w+): ([^\s]+)', declared))
            prefix.setdefault('og', 'http://ogp.me/ns#')
            props = []
            # Only <meta> elements that are direct children of <head> and
            # have both attributes are considered.
            for el in head.xpath('meta[@property and @content]'):
                prop = el.attrib['property']
                val = el.attrib['content']
                # Keep a property only when its prefix is a known namespace.
                ns = prop.partition(':')[0]
                if ns in prefix:
                    props.append((prop, val))
            if props:
                yield {'namespace': prefix, 'properties': props}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,30 @@ | ||
import argparse | ||
import json | ||
|
||
import lxml | ||
import requests | ||
from extruct.jsonld import JsonLdExtractor | ||
from extruct.rdfa import RDFaExtractor | ||
from extruct.w3cmicrodata import MicrodataExtractor | ||
from extruct.xmldom import XmlDomHTMLParser | ||
|
||
import extruct | ||
from extruct import SYNTAXES | ||
|
||
def metadata_from_url(url, syntaxes=SYNTAXES):
    """Fetch `url` and return its extracted metadata as a dict.

    The result always holds 'url' and 'status' (the HTTP status code and
    reason). When the response is not an HTTP error, the result is also
    updated with the dict returned by extruct.extract() for the requested
    `syntaxes`.
    """
    resp = requests.get(url, timeout=30)
    result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        # Report the failing status instead of raising to the caller.
        return result

    result.update(extruct.extract(resp.content, url=url, syntaxes=syntaxes))
    return result
|
||
|
||
def main(args=None):
    """Run the extruct command line and return the metadata as a JSON string.

    `args` is the list of command-line arguments; None (the default) makes
    argparse read them from sys.argv.
    """
    parser = argparse.ArgumentParser(prog='extruct', description=__doc__)
    arg = parser.add_argument
    arg('url', help='The target URL')
    arg('--syntaxes', nargs='+',
        choices=SYNTAXES,
        default=SYNTAXES,
        help='List of syntaxes to extract. Valid values any or all (default): '
             'microdata, opengraph, microformat, json-ld, rdfa. '
             'Example: --syntaxes microdata opengraph json-ld')
    args = parser.parse_args(args)
    metadata = metadata_from_url(args.url, args.syntaxes)
    return json.dumps(metadata, indent=2, sort_keys=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ gevent | |
requests | ||
rdflib | ||
rdflib-jsonld | ||
mf2py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,16 @@ | ||
# -*- coding: utf-8 -*- | ||
import os | ||
import json | ||
|
||
|
||
# Directory holding the sample documents, next to this module.
tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'samples')


def get_testdata(*paths):
    """Return the raw bytes of the file at `paths`, joined onto tests_datadir."""
    path = os.path.join(tests_datadir, *paths)
    # The context manager closes the handle even if the read fails.
    with open(path, 'rb') as f_in:
        return f_in.read()
|
||
|
||
def jsonize_dict(d):
    """Return `d` after a round trip through JSON encoding and decoding.

    The result therefore holds only JSON-native types: tuples come back
    as lists, for example.
    """
    return json.loads(json.dumps(d))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml"> | ||
<head> | ||
<title>Himanshu's Open Graph Protocol</title> | ||
<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" /> | ||
<meta http-equiv="Content-Language" content="en-us" /> | ||
<link rel="stylesheet" type="text/css" href="event-education.css" /> | ||
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" > | ||
<meta property="og:title" content="Himanshu's Open Graph Protocol"/> | ||
<article class="h-entry"> | ||
<h1 class="p-name">Microformats are amazing</h1> | ||
<p>Published by <a class="p-author h-card" href="http://example.com">W. Developer</a> | ||
on <time class="dt-published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time></p> | ||
|
||
<p class="p-summary">In which I extoll the virtues of using microformats.</p> | ||
|
||
<div class="e-content"> | ||
<p>Blah blah blah</p> | ||
</div> | ||
</article> | ||
|
||
</head> | ||
|
||
<body> | ||
|
||
<div id="fb-root"></div> | ||
<script>(function(d, s, id) { | ||
var js, fjs = d.getElementsByTagName(s)[0]; | ||
if (d.getElementById(id)) return; | ||
js = d.createElement(s); js.id = id; | ||
js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103"; | ||
fjs.parentNode.insertBefore(js, fjs); | ||
}(document, 'script', 'facebook-jssdk'));</script> | ||
. | ||
. | ||
. | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
[ | ||
{ | ||
"type": [ | ||
"h-entry" | ||
], | ||
"properties": { | ||
"name": [ | ||
"Microformats are amazing" | ||
], | ||
"author": [ | ||
{ | ||
"type": [ | ||
"h-card" | ||
], | ||
"properties": { | ||
"name": [ | ||
"W. Developer" | ||
], | ||
"url": [ | ||
"http://example.com" | ||
] | ||
}, | ||
"value": "W. Developer" | ||
} | ||
], | ||
"published": [ | ||
"2013-06-13 12:00:00" | ||
], | ||
"summary": [ | ||
"In which I extoll the virtues of using microformats." | ||
], | ||
"content": [ | ||
{ | ||
"html": "\n<p>Blah blah blah</p>\n", | ||
"value": "\nBlah blah blah\n" | ||
} | ||
] | ||
} | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml"> | ||
<head> | ||
<title>Himanshu's Open Graph Protocol</title> | ||
<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" /> | ||
<meta http-equiv="Content-Language" content="en-us" /> | ||
<link rel="stylesheet" type="text/css" href="event-education.css" /> | ||
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" > | ||
<meta property="og:title" content="Himanshu's Open Graph Protocol"/> | ||
<meta property="og:type" content="article"/> | ||
<meta property="og:url" content="https://www.eventeducation.com/test.php"/> | ||
<meta property="og:image" content="https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"/> | ||
<meta property="fb:admins" content="himanshu160"/> | ||
<meta property="og:site_name" content="Event Education"/> | ||
<meta property="og:description" content="Event Education provides free courses on event planning and management to event professionals worldwide."/> | ||
|
||
</head> | ||
|
||
<body> | ||
|
||
<div id="fb-root"></div> | ||
<script>(function(d, s, id) { | ||
var js, fjs = d.getElementsByTagName(s)[0]; | ||
if (d.getElementById(id)) return; | ||
js = d.createElement(s); js.id = id; | ||
js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103"; | ||
fjs.parentNode.insertBefore(js, fjs); | ||
}(document, 'script', 'facebook-jssdk'));</script> | ||
. | ||
. | ||
. | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
[ | ||
{ | ||
"namespace": { | ||
"og": "http://ogp.me/ns#" | ||
}, | ||
"properties": [ | ||
[ | ||
"og:title", | ||
"Himanshu's Open Graph Protocol" | ||
], | ||
[ | ||
"og:type", | ||
"article" | ||
], | ||
[ | ||
"og:url", | ||
"https://www.eventeducation.com/test.php" | ||
], | ||
[ | ||
"og:image", | ||
"https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg" | ||
], | ||
[ | ||
"og:site_name", | ||
"Event Education" | ||
], | ||
[ | ||
"og:description", | ||
"Event Education provides free courses on event planning and management to event professionals worldwide." | ||
] | ||
] | ||
} | ||
] |
Oops, something went wrong.