Add an option to uniform opengraph, microdata and microformat to json-ld

* Add uniform and tests for opengraph, microdata and microformat * from six import urljoin urlparse * Add six and mf2py to requirements * Add tests for uniform.py * add files for tests * fix test * fix uniform, update notes, add uniform to tool * fix tests * Fix the test fix * update readme * update readme * update readme * update readme * update readme * update readme * update readme * add new arguments in tools, remove FIXME in uniform, fix readme
scrapinghub · Apr 9, 2018 · becf22e · becf22e
1 parent 54ef8e7
commit becf22e
Show file tree

Hide file tree

Showing 13 changed files with 502 additions and 89 deletions.
diff --git a/README.rst b/README.rst
@@ -162,91 +162,15 @@ First fetch the HTML using python-requests and then feed the response body to ``
                 'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}],
                 'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]}
 
-
-Another example with a page from SongKick containing RDFa, JSON-LD and Open Graph metadata::
+Select syntaxes
++++++++++++++++
+It is possible to select which syntaxes to extract by passing a list of the desired ones. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed, all syntaxes will be extracted and returned::
 
   >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
-  >>> data = extruct.extract(r.text, r.url)
+  >>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'])
   >>>
   >>> pp.pprint(data)
-  { 'json-ld': [ { '@context': 'http://schema.org',
-                   '@type': 'MusicEvent',
-                   'location': { '@type': 'Place',
-                                 'address': { '@type': 'PostalAddress',
-                                              'addressCountry': 'US',
-                                              'addressLocality': 'Brooklyn',
-                                              'addressRegion': 'NY',
-                                              'postalCode': '11225',
-                                              'streetAddress': '497 Rogers Ave'},
-                                 'geo': { '@type': 'GeoCoordinates',
-                                          'latitude': 40.660109,
-                                          'longitude': -73.953193},
-                                 'name': 'The Owl Music Parlor',
-                                 'sameAs': 'http://www.theowl.nyc'},
-                   'name': 'Elysian Fields',
-                   'performer': [ { '@type': 'MusicGroup',
-                                    'name': 'Elysian Fields',
-                                    'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'}],
-                   'startDate': '2017-06-10T19:30:00-0400',
-                   'url': 'https://www.songkick.com/concerts/30173984-elysian-fields-at-owl-music-parlor?utm_medium=organic&utm_source=microformat'},
-                 { '@context': 'http://schema.org',
-                   '@type': 'MusicEvent',
-                   'location': { '@type': 'Place',
-                                 'address': { '@type': 'PostalAddress',
-                                              'addressCountry': 'US',
-                                              'addressLocality': 'San Francisco',
-                                              'addressRegion': 'CA',
-                                              'postalCode': '94107',
-                                              'streetAddress': '500 Fourth '
-                                                               'Street'},
-                                 'geo': { '@type': 'GeoCoordinates',
-                                          'latitude': 37.7795638,
-                                          'longitude': -122.398023},
-                                 'name': 'Hotel Utah Saloon',
-                                 'sameAs': 'http://www.hotelutah.com/'},
-                   'name': 'Elysian Fields',
-                   'performer': [ { '@type': 'MusicGroup',
-                                    'name': 'Elysian Fields',
-                                    'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'},
-                                  { '@type': 'MusicGroup',
-                                    'name': 'Chocolate Genius Inc.',
-                                    'sameAs': 'https://www.songkick.com/artists/1009602-chocolate-genius-inc?utm_medium=organic&utm_source=microformat'}],
-                   'startDate': '2017-04-26T20:00:00-0700',
-                   'url': 'https://www.songkick.com/concerts/29673614-elysian-fields-at-hotel-utah-saloon?utm_medium=organic&utm_source=microformat'},
-                 { '@context': 'http://schema.org',
-                   '@type': 'MusicEvent',
-                   'location': { '@type': 'Place',
-                                 'address': { '@type': 'PostalAddress',
-                                              'addressCountry': 'France',
-                                              'addressLocality': 'Saint-Nazaire',
-                                              'postalCode': '44600',
-                                              'streetAddress': 'Alvéole 14 de la '
-                                                               'base sous-Marine '
-                                                               'Bd de la Légion '
-                                                               'd’Honneur'},
-                                 'geo': { '@type': 'GeoCoordinates',
-                                          'latitude': 47.2755434,
-                                          'longitude': -2.2022817},
-                                 'name': 'VIP',
-                                 'sameAs': 'http://www.levip-saintnazaire.com/'},
-                   'name': 'Elysian Fields',
-                   'performer': [ { '@type': 'MusicGroup',
-                                    'name': 'Elysian Fields',
-                                    'sameAs': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'},
-                                  { '@type': 'MusicGroup',
-                                    'name': 'Troy Von Balthazar',
-                                    'sameAs': 'https://www.songkick.com/artists/355304-troy-von-balthazar?utm_medium=organic&utm_source=microformat'}],
-                   'startDate': '2016-10-29T21:00:00+0200',
-                   'url': 'https://www.songkick.com/concerts/27626524-elysian-fields-at-vip?utm_medium=organic&utm_source=microformat'},
-                 { '@context': 'http://schema.org',
-                   '@type': 'MusicGroup',
-                   'image': 'https://images.sk-static.com/images/media/profile_images/artists/236156/card_avatar',
-                   'interactionCount': '6100 UserLikes',
-                   'logo': 'https://images.sk-static.com/images/media/profile_images/artists/236156/card_avatar',
-                   'name': 'Elysian Fields',
-                   'url': 'https://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic&utm_source=microformat'}],
-    'microdata': [],
-    'microformat': [],
+  { 'microdata': [],
     'opengraph': [ { 'namespace': { 'concerts': 'http://ogp.me/ns/fb/songkick-concerts#',
                                     'fb': 'http://www.facebook.com/2008/fbml',
                                     'og': 'http://ogp.me/ns#'},
@@ -281,6 +205,57 @@ Another example with a page from SongKick containing RDFa, JSON-LD and Open Grap
                 'http://www.facebook.com/2008/fbmlapp_id': [ { '@value': '308540029359'}]}]}
 
 
+Uniform
++++++++
+Another option is to make the output of the microformat, opengraph, microdata and json-ld syntaxes uniform, using the following structure: ::
+
+    {'@context': 'http://example.com', 
+                 '@type': 'example_type',
+                 /* All the other properties as keys here */
+                 }
+
+To do so, set ``uniform=True`` when calling ``extract``; it is ``False`` by default for backward compatibility. Here is the same example as before, but with ``uniform`` set to ``True``: ::
+
+  >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
+  >>> data = extruct.extract(r.text, r.url, syntaxes=['microdata', 'opengraph', 'rdfa'], uniform=True)
+  >>>
+  >>> pp.pprint(data)
+  { 'microdata': [],
+    'opengraph': [ { '@context': { 'concerts': 'http://ogp.me/ns/fb/songkick-concerts#',
+                                 'fb': 'http://www.facebook.com/2008/fbml',
+                                 'og': 'http://ogp.me/ns#'},
+                   '@type': 'songkick-concerts:artist',
+                   'fb:app_id': '308540029359',
+                   'og:description': 'Find out when Elysian Fields is next '
+                                     'playing live near you. List of all '
+                                     'Elysian Fields tour dates and concerts.',
+                   'og:image': 'http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg',
+                   'og:site_name': 'Songkick',
+                   'og:title': 'Elysian Fields',
+                   'og:url': 'https://www.songkick.com/artists/236156-elysian-fields'}],
+    'rdfa': [ { '@id': 'https://www.songkick.com/artists/236156-elysian-fields',
+                'al:ios:app_name': [{'@value': 'Songkick Concerts'}],
+                'al:ios:app_store_id': [{'@value': '438690886'}],
+                'al:ios:url': [ { '@value': 'songkick://artists/236156-elysian-fields'}],
+                'http://ogp.me/ns#description': [ { '@value': 'Find out when '
+                                                              'Elysian Fields is '
+                                                              'next playing live '
+                                                              'near you. List of '
+                                                              'all Elysian '
+                                                              'Fields tour dates '
+                                                              'and concerts.'}],
+                'http://ogp.me/ns#image': [ { '@value': 'http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg'}],
+                'http://ogp.me/ns#site_name': [{'@value': 'Songkick'}],
+                'http://ogp.me/ns#title': [{'@value': 'Elysian Fields'}],
+                'http://ogp.me/ns#type': [{'@value': 'songkick-concerts:artist'}],
+                'http://ogp.me/ns#url': [ { '@value': 'https://www.songkick.com/artists/236156-elysian-fields'}],
+                'http://www.facebook.com/2008/fbmlapp_id': [ { '@value': '308540029359'}]}]}
+
+NB: the rdfa output is not made uniform yet.
+
+Single extractors
+-----------------
+
 You can also use each extractor individually. See below.
 
 Microdata extraction
@@ -703,7 +678,7 @@ Supported Parameters
 By default, the command line tool will try to extract all the supported
 metadata formats from the page (currently Microdata, JSON-LD, RDFa, Open Graph
 and Microformat). If you want to restrict the output to just one or a subset of
-those, you can pass their individual names through 'syntaxes' argument.
+those, you can pass their individual names as a list through the 'syntaxes' argument.
 
 For example, this command extracts only Microdata and JSON-LD metadata from
 "http://example.com"::

diff --git a/extruct/__init__.py b/extruct/__init__.py
@@ -6,19 +6,29 @@
 from extruct.opengraph import OpenGraphExtractor
 from extruct.microformat import MicroformatExtractor
 from extruct.xmldom import XmlDomHTMLParser
+from extruct.uniform import _umicrodata_microformat, _uopengraph
 
 logger = logging.getLogger(__name__)
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
 
 def extract(htmlstring, url=None, encoding="UTF-8",
             syntaxes=SYNTAXES,
-            errors='strict'):
+            errors='strict',
+            uniform=False,
+            schema_context='http://schema.org'):
     """htmlstring: string with valid html document;
        url: url of the html documents
        encoding: encoding of the html document
        syntaxes: list of syntaxes to extract, default SYNTAXES
        errors: set to 'log' to save exceptions to file, 'ignore' to ignore them
-               or 'strict'(default) to raise them"""
+               or 'strict'(default) to raise them
+       uniform: if True uniform output format of all syntaxes to a list of dicts.
+                Returned dicts structure:
+                {'@context': 'http://example.com', 
+                 '@type': 'example_type',
+                 /* All other the properties in keys here */
+                 }
+       schema_context: schema's context for current page"""
     if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
@@ -51,4 +61,14 @@ def extract(htmlstring, url=None, encoding="UTF-8",
                 pass
             if errors == 'strict':
                 raise
+
+    if uniform:
+        if 'microdata' in syntaxes:
+            output['microdata'] = _umicrodata_microformat(output['microdata'],
+                                                        schema_context=schema_context)
+        if 'microformat' in syntaxes:
+            output['microformat'] = _umicrodata_microformat(output['microformat'],
+                                                          schema_context='http://microformats.org/wiki/')
+        if 'opengraph' in syntaxes:
+            output['opengraph'] = _uopengraph(output['opengraph'])
     return output
diff --git a/extruct/tool.py b/extruct/tool.py
@@ -4,14 +4,18 @@
 import extruct
 from extruct import SYNTAXES
 
-def metadata_from_url(url, syntaxes=SYNTAXES):
+def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False, schema_context='http://schema.org'):
     resp = requests.get(url, timeout=30)
     result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
     try:
         resp.raise_for_status()
     except requests.exceptions.HTTPError:
         return result
-    result.update(extruct.extract(resp.content, url=url, syntaxes=syntaxes))
+    result.update(extruct.extract(resp.content, 
+                                  url=url, 
+                                  syntaxes=syntaxes, 
+                                  uniform=uniform,
+                                  schema_context=schema_context))
     return result
 
 
@@ -25,6 +29,15 @@ def main(args=None):
         help='List of syntaxes to extract. Valid values any or all (default):'
              'microdata, opengraph, microformat json-ld, rdfa.'
              'Example: --syntaxes microdata opengraph json-ld')
+    arg('--uniform', default=False, 
+        help='''If True uniform output format of all syntaxes to a list of dicts.
+                Returned dicts structure:
+                {'@context': 'http://example.com', 
+                 '@type': 'example_type',
+                 /* All other the properties in keys here */
+                 }''')
+    arg('--schema_context', default='http://schema.org', 
+        help="schema's context for current page")
     args = parser.parse_args(args)
-    metadata = metadata_from_url(args.url, args.syntaxes)
+    metadata = metadata_from_url(args.url, args.syntaxes, args.uniform, args.schema_context)
     return json.dumps(metadata, indent=2, sort_keys=True)
diff --git a/extruct/uniform.py b/extruct/uniform.py
@@ -0,0 +1,67 @@
+from six.moves.urllib.parse import urlparse, urljoin
+
+
+def _uopengraph(extracted):
+    out = []
+    for obj in extracted:
+        flattened = dict(obj['properties'])
+        t = flattened.pop('og:type', None)
+        if t:
+            flattened['@type'] = t
+        flattened['@context'] = obj['namespace']
+        out.append(flattened)
+    return out
+
+
+def _umicrodata_microformat(extracted, schema_context):
+    res = []
+    if isinstance(extracted, list):
+        for obj in extracted:
+            res.append(flatten_dict(obj, schema_context, True))
+    elif isinstance(extracted, dict):
+        res.append(flatten_dict(extracted, schema_context, False))
+
+    return res
+
+
+def flatten_dict(d, schema_context, add_context):
+    out = dict(d)
+    typ = out.pop('type', None)
+    if not typ:
+        return d
+
+    if isinstance(typ, list):
+        out['@type'] = typ
+        context = schema_context
+    else:
+        context, typ = infer_context(typ, schema_context)
+        out['@type'] = typ
+
+    if add_context:
+        out['@context'] = context
+
+    props = out.pop('properties', {})
+    for field, value in props.items():
+        if isinstance(value, dict):
+            value = flatten_dict(value, schema_context, False)
+        elif isinstance(value, list):
+            value = [
+                flatten_dict(o, schema_context, False)
+                if isinstance(o, dict) else o
+                for o in value
+            ]
+        out[field] = value
+    return out
+
+
+def infer_context(typ, context='http://schema.org'):
+    parsed_context = urlparse(typ)
+    if parsed_context.netloc:
+        base = ''.join([parsed_context.scheme, '://', parsed_context.netloc])
+        if parsed_context.path and parsed_context.fragment:
+            context = urljoin(base, parsed_context.path)
+            typ = parsed_context.fragment.strip('/')
+        elif parsed_context.path:
+            context = base
+            typ = parsed_context.path.strip('/')
+    return context, typ
diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,5 @@ requests
 rdflib
 rdflib-jsonld
 mf2py
-w3lib
+six
+w3lib
diff --git a/setup.py b/setup.py
@@ -32,7 +32,8 @@ def get_version():
                       'rdflib', 
                       'rdflib-jsonld', 
                       'mf2py', 
-                      'w3lib'],
+                      'w3lib',
+                      'six'],
     extras_require={
         'service': [
             'bottle',

diff --git a/tests/samples/misc/microformat_flat_test.json b/tests/samples/misc/microformat_flat_test.json
@@ -0,0 +1,33 @@
+[
+    {
+        "@type": ["h-entry"],
+        "@context": "http://microformats.org/wiki/",
+        "name": [
+            "Microformats are amazing"
+        ],
+        "author": [
+            {
+                "@type": ["h-card"],
+                "name": [
+                    "W. Developer"
+                ],
+                "url": [
+                    "http://example.com"
+                ],
+                "value": "W. Developer"
+            }
+        ],
+        "published": [
+            "2013-06-13 12:00:00"
+        ],
+        "summary": [
+            "In which I extoll the virtues of using microformats."
+        ],
+        "content": [
+            {
+                "html": "\n<p>Blah blah blah</p>\n",
+                "value": "\nBlah blah blah\n"
+            }
+            ]
+        }
+]