Skip to content

Commit

Permalink
Add an option to uniform opengraph, microdata and microformat to json-ld
Browse files Browse the repository at this point in the history
* Add uniform and tests for opengraph, microdata and microformat

* from six import urljoin urlparse

* Add six and mf2py to requirements

* Add tests for uniform.py

* add files for tests

* fix test

* fix uniform, update notes, add uniform to tool

* fix tests

* Fix the test fix

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* add new arguments in tools, remove FIXME in uniform, fix readme
  • Loading branch information
Kebniss authored and kmike committed Apr 9, 2018
1 parent 54ef8e7 commit becf22e
Show file tree
Hide file tree
Showing 13 changed files with 502 additions and 89 deletions.
139 changes: 57 additions & 82 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,91 +162,15 @@ First fetch the HTML using python-requests and then feed the response body to ``
'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}],
'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]}


Another example with a page from SongKick containing RDFa, JSON-LD and Open Graph metadata::
Select syntaxes
+++++++++++++++
You can choose which syntaxes to extract by passing a list of their names. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed, all syntaxes are extracted and returned::

>>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
>>> data = extruct.extract(r.text, r.url)
>>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'])
>>>
>>> pp.pprint(data)
{ 'json-ld': [ { '@context': 'http://schema.org',
'@type': 'MusicEvent',
'location': { '@type': 'Place',
'address': { '@type': 'PostalAddress',
'addressCountry': 'US',
'addressLocality': 'Brooklyn',
'addressRegion': 'NY',
'postalCode': '11225',
'streetAddress': '497 Rogers Ave'},
'geo': { '@type': 'GeoCoordinates',
'latitude': 40.660109,
'longitude': -73.953193},
'name': 'The Owl Music Parlor',
'sameAs': 'http://www.theowl.nyc'},
'name': 'Elysian Fields',
'performer': [ { '@type': 'MusicGroup',
'name': 'Elysian Fields',
'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'}],
'startDate': '2017-06-10T19:30:00-0400',
'url': 'https://www.songkick.com/concerts/30173984-elysian-fields-at-owl-music-parlor?utm_medium=organic&utm_source=microformat'},
{ '@context': 'http://schema.org',
'@type': 'MusicEvent',
'location': { '@type': 'Place',
'address': { '@type': 'PostalAddress',
'addressCountry': 'US',
'addressLocality': 'San Francisco',
'addressRegion': 'CA',
'postalCode': '94107',
'streetAddress': '500 Fourth '
'Street'},
'geo': { '@type': 'GeoCoordinates',
'latitude': 37.7795638,
'longitude': -122.398023},
'name': 'Hotel Utah Saloon',
'sameAs': 'http://www.hotelutah.com/'},
'name': 'Elysian Fields',
'performer': [ { '@type': 'MusicGroup',
'name': 'Elysian Fields',
'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'},
{ '@type': 'MusicGroup',
'name': 'Chocolate Genius Inc.',
'sameAs': 'https://www.songkick.com/artists/1009602-chocolate-genius-inc?utm_medium=organic&utm_source=microformat'}],
'startDate': '2017-04-26T20:00:00-0700',
'url': 'https://www.songkick.com/concerts/29673614-elysian-fields-at-hotel-utah-saloon?utm_medium=organic&utm_source=microformat'},
{ '@context': 'http://schema.org',
'@type': 'MusicEvent',
'location': { '@type': 'Place',
'address': { '@type': 'PostalAddress',
'addressCountry': 'France',
'addressLocality': 'Saint-Nazaire',
'postalCode': '44600',
'streetAddress': 'Alvéole 14 de la '
'base sous-Marine '
'Bd de la Légion '
'd’Honneur'},
'geo': { '@type': 'GeoCoordinates',
'latitude': 47.2755434,
'longitude': -2.2022817},
'name': 'VIP',
'sameAs': 'http://www.levip-saintnazaire.com/'},
'name': 'Elysian Fields',
'performer': [ { '@type': 'MusicGroup',
'name': 'Elysian Fields',
'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'},
{ '@type': 'MusicGroup',
'name': 'Troy Von Balthazar',
'sameAs': 'https://www.songkick.com/artists/355304-troy-von-balthazar?utm_medium=organic&utm_source=microformat'}],
'startDate': '2016-10-29T21:00:00+0200',
'url': 'https://www.songkick.com/concerts/27626524-elysian-fields-at-vip?utm_medium=organic&utm_source=microformat'},
{ '@context': 'http://schema.org',
'@type': 'MusicGroup',
'image': 'https://images.sk-static.com/images/media/profile_images/artists/236156/card_avatar',
'interactionCount': '6100 UserLikes',
'logo': 'https://images.sk-static.com/images/media/profile_images/artists/236156/card_avatar',
'name': 'Elysian Fields',
'url': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'}],
'microdata': [],
'microformat': [],
{ 'microdata': [],
'opengraph': [ { 'namespace': { 'concerts': 'http://ogp.me/ns/fb/songkick-concerts#',
'fb': 'http://www.facebook.com/2008/fbml',
'og': 'http://ogp.me/ns#'},
Expand Down Expand Up @@ -281,6 +205,57 @@ Another example with a page from SongKick containing RDFa, JSON-LD and Open Grap
'http://www.facebook.com/2008/fbmlapp_id': [ { '@value': '308540029359'}]}]}


Uniform
+++++++
Another option is to normalize the output of the microformat, opengraph, microdata and json-ld syntaxes to the following uniform structure::

{'@context': 'http://example.com',
'@type': 'example_type',
/* All the other properties as keys here */
}

To do so, set ``uniform=True`` when calling ``extract``; it is ``False`` by default for backward compatibility. Here is the same example as before, but with ``uniform`` set to ``True``::

>>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
>>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
>>>
>>> pp.pprint(data)
{ 'microdata': [],
'opengraph': [ { '@context': { 'concerts': 'http://ogp.me/ns/fb/songkick-concerts#',
'fb': 'http://www.facebook.com/2008/fbml',
'og': 'http://ogp.me/ns#'},
'@type': 'songkick-concerts:artist',
'fb:app_id': '308540029359',
'og:description': 'Find out when Elysian Fields is next '
'playing live near you. List of all '
'Elysian Fields tour dates and concerts.',
'og:image': 'http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg',
'og:site_name': 'Songkick',
'og:title': 'Elysian Fields',
'og:url': 'https://www.songkick.com/artists/236156-elysian-fields'}],
'rdfa': [ { '@id': 'https://www.songkick.com/artists/236156-elysian-fields',
'al:ios:app_name': [{'@value': 'Songkick Concerts'}],
'al:ios:app_store_id': [{'@value': '438690886'}],
'al:ios:url': [ { '@value': 'songkick://artists/236156-elysian-fields'}],
'http://ogp.me/ns#description': [ { '@value': 'Find out when '
'Elysian Fields is '
'next playing live '
'near you. List of '
'all Elysian '
'Fields tour dates '
'and concerts.'}],
'http://ogp.me/ns#image': [ { '@value': 'http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg'}],
'http://ogp.me/ns#site_name': [{'@value': 'Songkick'}],
'http://ogp.me/ns#title': [{'@value': 'Elysian Fields'}],
'http://ogp.me/ns#type': [{'@value': 'songkick-concerts:artist'}],
'http://ogp.me/ns#url': [ { '@value': 'https://www.songkick.com/artists/236156-elysian-fields'}],
'http://www.facebook.com/2008/fbmlapp_id': [ { '@value': '308540029359'}]}]}

NB: the rdfa output is not converted to the uniform structure yet.

Single extractors
-----------------

You can also use each extractor individually. See below.

Microdata extraction
Expand Down Expand Up @@ -703,7 +678,7 @@ Supported Parameters
By default, the command line tool will try to extract all the supported
metadata formats from the page (currently Microdata, JSON-LD, RDFa, Open Graph
and Microformat). If you want to restrict the output to just one or a subset of
those, you can pass their individual names through 'syntaxes' argument.
those, you can pass their individual names, collected in a list, through the 'syntaxes' argument.

For example, this command extracts only Microdata and JSON-LD metadata from
"http://example.com"::
Expand Down
24 changes: 22 additions & 2 deletions extruct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,29 @@
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
from extruct.xmldom import XmlDomHTMLParser
from extruct.uniform import _umicrodata_microformat, _uopengraph

logger = logging.getLogger(__name__)
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']

def extract(htmlstring, url=None, encoding="UTF-8",
syntaxes=SYNTAXES,
errors='strict'):
errors='strict',
uniform=False,
schema_context='http://schema.org'):
"""htmlstring: string with valid html document;
url: url of the html documents
encoding: encoding of the html document
syntaxes: list of syntaxes to extract, default SYNTAXES
errors: set to 'log' to save exceptions to file, 'ignore' to ignore them
or 'strict'(default) to raise them"""
or 'strict'(default) to raise them
uniform: if True uniform output format of all syntaxes to a list of dicts.
Returned dicts structure:
{'@context': 'http://example.com',
'@type': 'example_type',
/* All other the properties in keys here */
}
schema_context: schema's context for current page"""
if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
raise ValueError("syntaxes must be a list with any or all (default) of"
"these values: {}".format(SYNTAXES))
Expand Down Expand Up @@ -51,4 +61,14 @@ def extract(htmlstring, url=None, encoding="UTF-8",
pass
if errors == 'strict':
raise

if uniform:
if 'microdata' in syntaxes:
output['microdata'] = _umicrodata_microformat(output['microdata'],
schema_context=schema_context)
if 'microformat' in syntaxes:
output['microformat'] = _umicrodata_microformat(output['microformat'],
schema_context='http://microformats.org/wiki/')
if 'opengraph' in syntaxes:
output['opengraph'] = _uopengraph(output['opengraph'])
return output
19 changes: 16 additions & 3 deletions extruct/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
import extruct
from extruct import SYNTAXES

def metadata_from_url(url, syntaxes=SYNTAXES):
def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False, schema_context='http://schema.org'):
resp = requests.get(url, timeout=30)
result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
try:
resp.raise_for_status()
except requests.exceptions.HTTPError:
return result
result.update(extruct.extract(resp.content, url=url, syntaxes=syntaxes))
result.update(extruct.extract(resp.content,
url=url,
syntaxes=syntaxes,
uniform=uniform,
schema_context=schema_context))
return result


Expand All @@ -25,6 +29,15 @@ def main(args=None):
help='List of syntaxes to extract. Valid values any or all (default):'
'microdata, opengraph, microformat json-ld, rdfa.'
'Example: --syntaxes microdata opengraph json-ld')
arg('--uniform', default=False,
help='''If True uniform output format of all syntaxes to a list of dicts.
Returned dicts structure:
{'@context': 'http://example.com',
'@type': 'example_type',
/* All other the properties in keys here */
}''')
arg('--schema_context', default='http://schema.org',
help="schema's context for current page")
args = parser.parse_args(args)
metadata = metadata_from_url(args.url, args.syntaxes)
metadata = metadata_from_url(args.url, args.syntaxes, args.uniform, args.schema_context)
return json.dumps(metadata, indent=2, sort_keys=True)
67 changes: 67 additions & 0 deletions extruct/uniform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from six.moves.urllib.parse import urlparse, urljoin


def _uopengraph(extracted):
out = []
for obj in extracted:
flattened = dict(obj['properties'])
t = flattened.pop('og:type', None)
if t:
flattened['@type'] = t
flattened['@context'] = obj['namespace']
out.append(flattened)
return out


def _umicrodata_microformat(extracted, schema_context):
res = []
if isinstance(extracted, list):
for obj in extracted:
res.append(flatten_dict(obj, schema_context, True))
elif isinstance(extracted, dict):
res.append(flatten_dict(extracted, schema_context, False))

return res


def flatten_dict(d, schema_context, add_context):
    """Flatten one extracted item into a JSON-LD-like dict.

    The item's ``'type'`` becomes ``'@type'`` and the entries of its
    ``'properties'`` dict are lifted to the top level.  Dict values (and
    dicts inside list values) of the properties are flattened recursively,
    always without an ``'@context'`` of their own.

    Parameters:
        d: item dict; it is never mutated, a new dict is always returned.
        schema_context: context used when the type is a list, and the
            fallback handed to ``infer_context`` when the type is a string.
        add_context: if true and the item has a type, store the resulting
            context under ``'@context'``.

    Items without a (truthy) ``'type'`` get neither ``'@type'`` nor
    ``'@context'``, but their ``'properties'`` are still flattened so that
    typed items nested inside an untyped one are converted as well.
    A ``'properties'`` value that is not a dict is left in place untouched.
    """
    out = dict(d)  # shallow copy so the caller's dict stays intact

    typ = out.get('type')
    if typ:
        del out['type']
        if isinstance(typ, list):
            # A list of types carries no URL to infer a context from.
            context = schema_context
        else:
            context, typ = infer_context(typ, schema_context)
        out['@type'] = typ
        if add_context:
            out['@context'] = context

    props = out.get('properties')
    if isinstance(props, dict):
        del out['properties']
        for field, value in props.items():
            if isinstance(value, dict):
                value = flatten_dict(value, schema_context, False)
            elif isinstance(value, list):
                value = [
                    flatten_dict(o, schema_context, False)
                    if isinstance(o, dict) else o
                    for o in value
                ]
            out[field] = value
    return out


def infer_context(typ, context='http://schema.org'):
    """Split a type URL into a ``(context, type)`` pair.

    * ``typ`` without a network location (e.g. a bare name) is returned
      unchanged together with the given ``context``.
    * A URL with both a path and a fragment yields the URL up to the path as
      context and the fragment as type.
    * A URL with only a path yields ``scheme://netloc`` as context and the
      path as type.
    * A URL with no path is returned unchanged with the given ``context``.

    Leading and trailing slashes are stripped from the returned type in the
    two URL cases.
    """
    parts = urlparse(typ)
    if not parts.netloc:
        return context, typ

    origin = parts.scheme + '://' + parts.netloc
    if not parts.path:
        return context, typ
    if parts.fragment:
        return urljoin(origin, parts.path), parts.fragment.strip('/')
    return origin, parts.path.strip('/')
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ requests
rdflib
rdflib-jsonld
mf2py
w3lib
six
w3lib
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def get_version():
'rdflib',
'rdflib-jsonld',
'mf2py',
'w3lib'],
'w3lib',
'six'],
extras_require={
'service': [
'bottle',
Expand Down
33 changes: 33 additions & 0 deletions tests/samples/misc/microformat_flat_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[
{
"@type": ["h-entry"],
"@context": "http://microformats.org/wiki/",
"name": [
"Microformats are amazing"
],
"author": [
{
"@type": ["h-card"],
"name": [
"W. Developer"
],
"url": [
"http://example.com"
],
"value": "W. Developer"
}
],
"published": [
"2013-06-13 12:00:00"
],
"summary": [
"In which I extoll the virtues of using microformats."
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
]
}
]

0 comments on commit becf22e

Please sign in to comment.