Skip to content

Commit

Permalink
Merge pull request #61 from scrapinghub/add_syntaxes
Browse files Browse the repository at this point in the history
Add og and microformat extraction
  • Loading branch information
Kebniss committed Mar 28, 2018
2 parents 76ad2de + f12733b commit 9e86435
Show file tree
Hide file tree
Showing 23 changed files with 1,624 additions and 1,130 deletions.
878 changes: 551 additions & 327 deletions README.rst

Large diffs are not rendered by default.

52 changes: 45 additions & 7 deletions extruct/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,54 @@
import logging
from lxml.html import fromstring

from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
from extruct.xmldom import XmlDomHTMLParser

# Module-level logger; extract() reports parsing failures through it when
# called with errors='log'.
logger = logging.getLogger(__name__)
# Syntax names accepted by extract(); also the default selection when the
# caller does not pass `syntaxes`.
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']

def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict'):
    """Extract structured metadata from an HTML document.

    htmlstring: string with valid html document;
    url: url of the html document
    encoding: encoding of the html document
    syntaxes: list of syntaxes to extract, default SYNTAXES
    errors: set to 'log' to log exceptions through this module's logger,
            'ignore' to ignore them or 'strict' (default) to raise them

    Returns a dict mapping each requested syntax name to the list of items
    found for it. With errors='log' or 'ignore', a syntax whose extractor
    fails is absent from the result.

    Raises ValueError when `syntaxes` is not a list of known syntax names or
    when `errors` is not one of the accepted values.
    """
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)

    # Extractors run in this fixed order, whatever the order of `syntaxes`.
    available = (
        ('microdata', MicrodataExtractor),
        ('json-ld', JsonLdExtractor),
        ('opengraph', OpenGraphExtractor),
        ('microformat', MicroformatExtractor),
        ('rdfa', RDFaExtractor),
    )
    processors = [(name, extractor_cls().extract_items)
                  for name, extractor_cls in available
                  if name in syntaxes]

    output = {}
    # The loop variable is deliberately not called `extract`, so it does not
    # shadow this function's own name.
    for label, extract_items in processors:
        try:
            # list() forces the (possibly lazy) extractor here, so that any
            # failure is caught by this handler.
            output[label] = list(extract_items(document=tree,
                                               url=url,
                                               html=htmlstring))
        except Exception:
            if errors == 'log':
                logger.exception("Failed to parse %s", url)
            elif errors == 'strict':
                # Bare raise keeps the original traceback.
                raise
            # errors == 'ignore': drop the failure silently, as requested.
    return output
10 changes: 10 additions & 0 deletions extruct/microformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import mf2py

class MicroformatExtractor(object):
    """Microformat extractor that delegates parsing to ``mf2py``."""

    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
        """Return the microformat items found in *htmlstring* as a list.

        *encoding* is accepted but not used by this method.
        """
        items = self.extract_items(htmlstring, url=url)
        return list(items)

    def extract_items(self, html, url, document=None):
        """Yield every entry of the ``'items'`` list returned by ``mf2py.parse``.

        *document* is accepted but ignored; parsing always starts from *html*.
        """
        parsed = mf2py.parse(html, html_parser="lxml", url=url)
        for item in parsed['items']:
            yield item
27 changes: 27 additions & 0 deletions extruct/opengraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import re
import lxml.html


class OpenGraphExtractor(object):
    """OpenGraph extractor following extruct API."""

    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
        """Parse *htmlstring* with lxml and return its OpenGraph items as a list.

        *url* is accepted but not used by this method.
        """
        html_parser = lxml.html.HTMLParser(encoding=encoding)
        tree = lxml.html.fromstring(htmlstring, parser=html_parser)
        return list(self.extract_items(tree))

    def extract_items(self, document, *args, **kwargs):
        """Yield one ``{'namespace': ..., 'properties': ...}`` dict per ``<head>``.

        Only ``<meta>`` children of the head that carry both ``property`` and
        ``content`` are considered, and only those whose property prefix is a
        declared namespace. A head with no such properties yields nothing.
        Extra positional and keyword arguments are ignored.
        """
        # OpenGraph defines a web page as a single rich object.
        # TODO: Handle known opengraph namespaces.
        for head in document.xpath('//head'):
            declared = re.findall(r'\s*(\w+): ([^\s]+)',
                                  head.attrib.get('prefix', ''))
            namespaces = dict(declared)
            # The 'og' prefix is always available, even when not declared.
            if 'og' not in namespaces:
                namespaces['og'] = 'http://ogp.me/ns#'
            properties = [
                (meta.attrib['property'], meta.attrib['content'])
                for meta in head.xpath('meta[@property and @content]')
                if meta.attrib['property'].partition(':')[0] in namespaces
            ]
            if not properties:
                continue
            yield {'namespace': namespaces, 'properties': properties}
4 changes: 2 additions & 2 deletions extruct/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def _decorated(*args, **kwargs):
return _decorated


# Set the response content type to JSON and return metadata_from_url's result
# for `url`.
# NOTE(review): this span is a rendered diff hunk; the first signature/call
# pair appears to be the pre-change version and the second pair its
# replacement - confirm against the repository.
def async_extruct(url, microdata=True, jsonld=True, rdfa=True):
def async_extruct(url, **kwargs):
# Declare the content type before producing the body.
response.content_type = 'application/json'
result = metadata_from_url(url, microdata, jsonld, rdfa)
# Keyword arguments are forwarded unchanged to metadata_from_url.
result = metadata_from_url(url, **kwargs)
return result


Expand Down
65 changes: 15 additions & 50 deletions extruct/tool.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,30 @@
import argparse
import json

import lxml
import requests
from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.xmldom import XmlDomHTMLParser

import extruct
from extruct import SYNTAXES

# Fetch `url` and return a dict holding the requested url, an HTTP status
# string and, when the request succeeded, the extracted metadata.
# NOTE(review): this span is a rendered diff hunk; the per-extractor code
# (parser/tree and the microdata/jsonld/rdfa branches) appears to be the
# pre-change body, replaced by the single extruct.extract() call - confirm
# against the repository.
def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
def metadata_from_url(url, syntaxes=SYNTAXES):
# Give up on the request after 30 seconds.
resp = requests.get(url, timeout=30)
result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
try:
resp.raise_for_status()
except requests.exceptions.HTTPError:
# HTTP error status: report only url and status, skip extraction.
return result

parser = XmlDomHTMLParser(encoding=resp.encoding)
tree = lxml.html.fromstring(resp.content, parser=parser)

if microdata:
mde = MicrodataExtractor(nested=True)
result['microdata'] = mde.extract_items(tree, resp.url)

if jsonld:
jsonlde = JsonLdExtractor()
result['json-ld'] = jsonlde.extract_items(tree, resp.url)

if rdfa:
rdfae = RDFaExtractor()
result['rdfa'] = rdfae.extract_items(tree, resp.url)

# Merge the per-syntax results into the url/status dict. Note that the
# requested `url` (not resp.url) is what gets passed to extract().
result.update(extruct.extract(resp.content, url=url, syntaxes=syntaxes))
return result


def main(args=None):
    """Command-line entry point: extract metadata from a URL and return JSON.

    args: list of command-line arguments to parse; when None, argparse falls
          back to the process arguments.

    Returns the metadata for the given URL serialised as an indented JSON
    string with sorted keys.
    """
    parser = argparse.ArgumentParser(prog='extruct', description=__doc__)
    arg = parser.add_argument
    arg('url', help='The target URL')
    # The help text lists the syntaxes from SYNTAXES itself so it cannot drift
    # from the accepted choices; the pieces are joined with explicit
    # separators.
    arg('--syntaxes', nargs='+',
        choices=SYNTAXES,
        default=SYNTAXES,
        help='List of syntaxes to extract. Valid values any or all (default): '
             '{}. '
             'Example: --syntaxes microdata opengraph json-ld'.format(
                 ', '.join(SYNTAXES)))
    args = parser.parse_args(args)
    metadata = metadata_from_url(args.url, args.syntaxes)
    return json.dumps(metadata, indent=2, sort_keys=True)
2 changes: 1 addition & 1 deletion extruct/xmldom.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from xml.dom.minidom import Attr, NamedNodeMap

from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath, tostring
from lxml.html import fromstring, HTMLParser, HtmlElementClassLookup
from lxml.html import HTMLParser, HtmlElementClassLookup


class DomElementUnicodeResult(object):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ gevent
requests
rdflib
rdflib-jsonld
mf2py
9 changes: 8 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
# -*- coding: utf-8 -*-
import os
import json


# Absolute path of the 'samples' directory that sits next to this file.
tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'samples')

# Read a sample file as bytes; `paths` are path components joined below
# tests_datadir.
# NOTE(review): this span is a rendered diff hunk; the bare
# `return open(...).read()` line appears to be the pre-change version of the
# `with` block that follows it - confirm against the repository.
def get_testdata(*paths):
"""Return test data"""
path = os.path.join(tests_datadir, *paths)
return open(path, 'rb').read()
# The context manager closes the file handle once the content is read.
with open(path, 'rb') as f_in:
return f_in.read()


def jsonize_dict(d):
    """Return *d* as it looks after a JSON serialise/parse round trip.

    Useful for comparing against data loaded from JSON: for example tuples
    come back as lists and non-string keys come back as strings.
    """
    serialized = json.dumps(d)
    return json.loads(serialized)
38 changes: 38 additions & 0 deletions tests/samples/misc/microformat_test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>Himanshu's Open Graph Protocol</title>
<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
<meta http-equiv="Content-Language" content="en-us" />
<link rel="stylesheet" type="text/css" href="event-education.css" />
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
<meta property="og:title" content="Himanshu's Open Graph Protocol"/>
<article class="h-entry">
<h1 class="p-name">Microformats are amazing</h1>
<p>Published by <a class="p-author h-card" href="http://example.com">W. Developer</a>
on <time class="dt-published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time></p>

<p class="p-summary">In which I extoll the virtues of using microformats.</p>

<div class="e-content">
<p>Blah blah blah</p>
</div>
</article>

</head>

<body>

<div id="fb-root"></div>
<script>(function(d, s, id) {
var js, fjs = d.getElementsByTagName(s)[0];
if (d.getElementById(id)) return;
js = d.createElement(s); js.id = id;
js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
fjs.parentNode.insertBefore(js, fjs);
}(document, 'script', 'facebook-jssdk'));</script>
.
.
.
</body>
</html>
40 changes: 40 additions & 0 deletions tests/samples/misc/microformat_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[
{
"type": [
"h-entry"
],
"properties": {
"name": [
"Microformats are amazing"
],
"author": [
{
"type": [
"h-card"
],
"properties": {
"name": [
"W. Developer"
],
"url": [
"http://example.com"
]
},
"value": "W. Developer"
}
],
"published": [
"2013-06-13 12:00:00"
],
"summary": [
"In which I extoll the virtues of using microformats."
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
]
}
}
]
33 changes: 33 additions & 0 deletions tests/samples/misc/opengraph_test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<title>Himanshu's Open Graph Protocol</title>
<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
<meta http-equiv="Content-Language" content="en-us" />
<link rel="stylesheet" type="text/css" href="event-education.css" />
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
<meta property="og:title" content="Himanshu's Open Graph Protocol"/>
<meta property="og:type" content="article"/>
<meta property="og:url" content="https://www.eventeducation.com/test.php"/>
<meta property="og:image" content="https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"/>
<meta property="fb:admins" content="himanshu160"/>
<meta property="og:site_name" content="Event Education"/>
<meta property="og:description" content="Event Education provides free courses on event planning and management to event professionals worldwide."/>

</head>

<body>

<div id="fb-root"></div>
<script>(function(d, s, id) {
var js, fjs = d.getElementsByTagName(s)[0];
if (d.getElementById(id)) return;
js = d.createElement(s); js.id = id;
js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
fjs.parentNode.insertBefore(js, fjs);
}(document, 'script', 'facebook-jssdk'));</script>
.
.
.
</body>
</html>
33 changes: 33 additions & 0 deletions tests/samples/misc/opengraph_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[
{
"namespace": {
"og": "http://ogp.me/ns#"
},
"properties": [
[
"og:title",
"Himanshu's Open Graph Protocol"
],
[
"og:type",
"article"
],
[
"og:url",
"https://www.eventeducation.com/test.php"
],
[
"og:image",
"https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"
],
[
"og:site_name",
"Event Education"
],
[
"og:description",
"Event Education provides free courses on event planning and management to event professionals worldwide."
]
]
}
]

0 comments on commit 9e86435

Please sign in to comment.