Merge pull request #61 from scrapinghub/add_syntaxes

Add og and microformat extraction
scrapinghub · Mar 28, 2018 · 9e86435 · 9e86435
2 parents 76ad2de + f12733b
commit 9e86435
Show file tree

Hide file tree

Showing 23 changed files with 1,624 additions and 1,130 deletions.
diff --git a/README.rst b/README.rst
diff --git a/extruct/__init__.py b/extruct/__init__.py
@@ -1,16 +1,54 @@
+import logging
 from lxml.html import fromstring
-
 from extruct.jsonld import JsonLdExtractor
 from extruct.rdfa import RDFaExtractor
 from extruct.w3cmicrodata import MicrodataExtractor
+from extruct.opengraph import OpenGraphExtractor
+from extruct.microformat import MicroformatExtractor
 from extruct.xmldom import XmlDomHTMLParser
 
+logger = logging.getLogger(__name__)
+SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
 
-def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
+def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8",
+            syntaxes=SYNTAXES,
+            errors='strict'):
+    """Extract structured metadata from an HTML document.
+
+    htmlstring: string with the html document to parse
+    url: url of the html document, handed to every extractor
+    encoding: encoding of the html document
+    syntaxes: list of syntaxes to extract, any subset of SYNTAXES
+              (default: all of them)
+    errors: what to do when an extractor raises: 'log' reports the
+            exception through this module's logger and carries on,
+            'ignore' silently skips that syntax, 'strict' (default)
+            re-raises the exception
+
+    Returns a dict mapping each requested syntax name to the list of items
+    extracted for it. A syntax whose extractor failed under 'log' or
+    'ignore' has no key in the result.
+
+    Raises ValueError if syntaxes or errors holds an unsupported value.
+    """
+    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
+        # Adjacent literals are concatenated as-is, so the separating space
+        # has to live inside one of them.
+        raise ValueError("syntaxes must be a list with any or all (default) of "
+                         "these values: {}".format(SYNTAXES))
+    if errors not in ['log', 'ignore', 'strict']:
+        raise ValueError('Invalid error command, valid values are either "log"'
+                         ', "ignore" or "strict"')
     domparser = XmlDomHTMLParser(encoding=encoding)
     tree = fromstring(htmlstring, parser=domparser)
-    return {name: extractor.extract_items(tree, url=url)
-            for name, extractor in (
-                ('json-ld', JsonLdExtractor()),
-                ('microdata', MicrodataExtractor()),
-                ('rdfa', RDFaExtractor()))}
+    processors = []
+    if 'microdata' in syntaxes:
+        processors.append(('microdata', MicrodataExtractor().extract_items))
+    if 'json-ld' in syntaxes:
+        processors.append(('json-ld', JsonLdExtractor().extract_items))
+    if 'opengraph' in syntaxes:
+        processors.append(('opengraph', OpenGraphExtractor().extract_items))
+    if 'microformat' in syntaxes:
+        processors.append(('microformat', MicroformatExtractor().extract_items))
+    if 'rdfa' in syntaxes:
+        processors.append(('rdfa', RDFaExtractor().extract_items))
+    output = {}
+    # The loop variable is not called "extract" so that it does not shadow
+    # this function inside its own body.
+    for label, extract_items in processors:
+        try:
+            output[label] = list(extract_items(document=tree,
+                                               url=url,
+                                               html=htmlstring))
+        except Exception:
+            if errors == 'log':
+                logger.exception("Failed to parse %s", url)
+            elif errors == 'strict':
+                # A bare raise re-raises the active exception unchanged.
+                raise
+            # errors == 'ignore': leave this syntax out and keep going.
+    return output
diff --git a/extruct/microformat.py b/extruct/microformat.py
@@ -0,0 +1,10 @@
+import mf2py
+
+class MicroformatExtractor(object):
+    """Microformats extractor that delegates the parsing to mf2py."""
+
+    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
+        """Return the microformat items found in htmlstring as a list.
+
+        encoding is accepted but not used by this method.
+        """
+        items = self.extract_items(htmlstring, url=url)
+        return list(items)
+
+    def extract_items(self, html, url, document=None):
+        """Yield each entry of the 'items' list that mf2py parses out of html.
+
+        document is accepted but not used by this method.
+        """
+        parsed = mf2py.parse(html, html_parser="lxml", url=url)
+        for item in parsed['items']:
+            yield item
diff --git a/extruct/opengraph.py b/extruct/opengraph.py
@@ -0,0 +1,27 @@
+import re
+import lxml.html
+
+
+class OpenGraphExtractor(object):
+    """OpenGraph extractor following extruct API."""
+
+    def extract(self, htmlstring, url='http://www.example.com/', encoding='UTF-8'):
+        """Parse htmlstring with the given encoding and return the OpenGraph
+        items found in it as a list. url is accepted but not used here."""
+        html_parser = lxml.html.HTMLParser(encoding=encoding)
+        root = lxml.html.fromstring(htmlstring, parser=html_parser)
+        return list(self.extract_items(root))
+
+    def extract_items(self, document, *args, **kwargs):
+        """Yield one dict per <head> element that carries matching properties.
+
+        Each dict has a 'namespace' mapping (prefix -> namespace url) and a
+        'properties' list of (property, content) pairs, in document order.
+        Extra positional and keyword arguments are ignored.
+        """
+        # OpenGraph defines a web page as a single rich object.
+        # TODO: Handle known opengraph namespaces.
+        for head in document.xpath('//head'):
+            # Namespaces declared as "name: url" pairs in the prefix attribute;
+            # 'og' is always available, declared or not.
+            declared = head.attrib.get('prefix', '')
+            namespaces = dict(re.findall(r'\s*(\w+): ([^\s]+)', declared))
+            namespaces.setdefault('og', 'http://ogp.me/ns#')
+            # Keep only the properties whose prefix is a known namespace.
+            pairs = [
+                (meta.attrib['property'], meta.attrib['content'])
+                for meta in head.xpath('meta[@property and @content]')
+                if meta.attrib['property'].partition(':')[0] in namespaces
+            ]
+            if pairs:
+                yield {'namespace': namespaces, 'properties': pairs}
diff --git a/extruct/service.py b/extruct/service.py
@@ -20,9 +20,9 @@ def _decorated(*args, **kwargs):
     return _decorated
 
 
-def async_extruct(url, microdata=True, jsonld=True, rdfa=True):
+def async_extruct(url, **kwargs):
+    """Set the response content type to JSON and return the result of
+    metadata_from_url(url, **kwargs)."""
     response.content_type = 'application/json'
-    result = metadata_from_url(url, microdata, jsonld, rdfa)
+    result = metadata_from_url(url, **kwargs)
     return result
 
 

diff --git a/extruct/tool.py b/extruct/tool.py
@@ -1,65 +1,30 @@
 import argparse
 import json
-
-import lxml
 import requests
-from extruct.jsonld import JsonLdExtractor
-from extruct.rdfa import RDFaExtractor
-from extruct.w3cmicrodata import MicrodataExtractor
-from extruct.xmldom import XmlDomHTMLParser
-
+import extruct
+from extruct import SYNTAXES
 
-def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
+def metadata_from_url(url, syntaxes=SYNTAXES):
+    """Fetch url and extract the requested syntaxes from the response body.
+
+    url: address of the page to download
+    syntaxes: list of syntaxes to extract, default SYNTAXES
+
+    Returns a dict with the requested 'url' and the HTTP 'status' line, plus
+    whatever extruct.extract returns for the page. When the response is an
+    HTTP error only 'url' and 'status' are present.
+    """
     resp = requests.get(url, timeout=30)
     result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
     try:
         resp.raise_for_status()
     except requests.exceptions.HTTPError:
         return result
-
-    parser = XmlDomHTMLParser(encoding=resp.encoding)
-    tree = lxml.html.fromstring(resp.content, parser=parser)
-
-    if microdata:
-        mde = MicrodataExtractor(nested=True)
-        result['microdata'] = mde.extract_items(tree, resp.url)
-
-    if jsonld:
-        jsonlde = JsonLdExtractor()
-        result['json-ld'] = jsonlde.extract_items(tree, resp.url)
-
-    if rdfa:
-        rdfae = RDFaExtractor()
-        result['rdfa'] = rdfae.extract_items(tree, resp.url)
-
+    # Pass the final URL (after any redirects) to the extractors, as the
+    # code this replaces did, rather than the URL that was requested.
+    result.update(extruct.extract(resp.content, url=resp.url, syntaxes=syntaxes))
     return result
 
 
-def main():
+def main(args=None):
+    """Parse the command line, extract metadata from the given URL and
+    return it as an indented, key-sorted JSON string.
+
+    args: list of argument strings; None lets argparse read sys.argv.
+    """
     parser = argparse.ArgumentParser(prog='extruct', description=__doc__)
-    parser.add_argument('url', help='The target URL')
-    parser.add_argument(
-        '--microdata',
-        action='store_true',
-        default=False,
-        help='Extract W3C Microdata from the page.',
-    )
-    parser.add_argument(
-        '--jsonld',
-        action='store_true',
-        default=False,
-        help='Extract JSON-LD metadata from the page.',
-    )
-    parser.add_argument(
-        '--rdfa',
-        action='store_true',
-        default=False,
-        help='Extract RDFa metadata from the page.',
-    )
-    args = parser.parse_args()
-
-    if any((args.microdata, args.jsonld, args.rdfa)):
-        metadata = metadata_from_url(args.url, args.microdata, args.jsonld, args.rdfa)
-    else:
-        metadata = metadata_from_url(args.url)
+    arg = parser.add_argument
+    arg('url', help='The target URL')
+    # Each help fragment ends with a space: adjacent string literals are
+    # joined without any separator.
+    arg('--syntaxes', nargs='+',
+        choices=SYNTAXES,
+        default=SYNTAXES,
+        help='List of syntaxes to extract. Valid values any or all (default): '
+             'microdata, opengraph, microformat, json-ld, rdfa. '
+             'Example: --syntaxes microdata opengraph json-ld')
+    args = parser.parse_args(args)
+    metadata = metadata_from_url(args.url, args.syntaxes)
     return json.dumps(metadata, indent=2, sort_keys=True)
diff --git a/extruct/xmldom.py b/extruct/xmldom.py
@@ -4,7 +4,7 @@
 from xml.dom.minidom import Attr, NamedNodeMap
 
 from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath, tostring
-from lxml.html import fromstring, HTMLParser, HtmlElementClassLookup
+from lxml.html import HTMLParser, HtmlElementClassLookup
 
 
 class DomElementUnicodeResult(object):

diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ gevent
 requests
 rdflib
 rdflib-jsonld
+mf2py
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,9 +1,16 @@
 # -*- coding: utf-8 -*-
 import os
+import json
+
 
 tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'samples')
 
 def get_testdata(*paths):
     """Return test data"""
     path = os.path.join(tests_datadir, *paths)
-    return open(path, 'rb').read()
+    # Read the sample as raw bytes; the with block closes the file.
+    with open(path, 'rb') as sample:
+        contents = sample.read()
+    return contents
+
+
+def jsonize_dict(d):
+    """Round-trip d through JSON so the result holds only JSON-native types
+    (tuples come back as lists, keys as strings)."""
+    serialized = json.dumps(d)
+    return json.loads(serialized)
diff --git a/tests/samples/misc/microformat_test.html b/tests/samples/misc/microformat_test.html
@@ -0,0 +1,38 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
+<head>
+<title>Himanshu's Open Graph Protocol</title>
+<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
+<meta http-equiv="Content-Language" content="en-us" />
+<link rel="stylesheet" type="text/css" href="event-education.css" />
+<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
+<meta property="og:title" content="Himanshu's Open Graph Protocol"/>
+<article class="h-entry">
+  <h1 class="p-name">Microformats are amazing</h1>
+  <p>Published by <a class="p-author h-card" href="http://example.com">W. Developer</a>
+     on <time class="dt-published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time></p>
+
+  <p class="p-summary">In which I extoll the virtues of using microformats.</p>
+
+  <div class="e-content">
+    <p>Blah blah blah</p>
+  </div>
+</article>
+
+</head>
+
+<body>
+
+<div id="fb-root"></div>
+<script>(function(d, s, id) {
+var js, fjs = d.getElementsByTagName(s)[0];
+if (d.getElementById(id)) return;
+js = d.createElement(s); js.id = id;
+js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
+fjs.parentNode.insertBefore(js, fjs);
+}(document, 'script', 'facebook-jssdk'));</script>
+.
+.
+.
+</body>
+</html>
diff --git a/tests/samples/misc/microformat_test.json b/tests/samples/misc/microformat_test.json
@@ -0,0 +1,40 @@
+[
+    {
+        "type": [
+            "h-entry"
+        ],
+        "properties": {
+            "name": [
+                "Microformats are amazing"
+            ],
+            "author": [
+                {
+                    "type": [
+                        "h-card"
+                    ],
+                    "properties": {
+                        "name": [
+                            "W. Developer"
+                        ],
+                        "url": [
+                            "http://example.com"
+                        ]
+                    },
+                    "value": "W. Developer"
+                }
+            ],
+            "published": [
+                "2013-06-13 12:00:00"
+            ],
+            "summary": [
+                "In which I extoll the virtues of using microformats."
+            ],
+            "content": [
+                {
+                    "html": "\n<p>Blah blah blah</p>\n",
+                    "value": "\nBlah blah blah\n"
+                }
+            ]
+        }
+    }
+]
diff --git a/tests/samples/misc/opengraph_test.html b/tests/samples/misc/opengraph_test.html
@@ -0,0 +1,33 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
+<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
+<head>
+<title>Himanshu's Open Graph Protocol</title>
+<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
+<meta http-equiv="Content-Language" content="en-us" />
+<link rel="stylesheet" type="text/css" href="event-education.css" />
+<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
+<meta property="og:title" content="Himanshu's Open Graph Protocol"/>
+<meta property="og:type" content="article"/>
+<meta property="og:url" content="https://www.eventeducation.com/test.php"/>
+<meta property="og:image" content="https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"/>
+<meta property="fb:admins" content="himanshu160"/>
+<meta property="og:site_name" content="Event Education"/>
+<meta property="og:description" content="Event Education provides free courses on event planning and management to event professionals worldwide."/>
+
+</head>
+
+<body>
+
+<div id="fb-root"></div>
+<script>(function(d, s, id) {
+var js, fjs = d.getElementsByTagName(s)[0];
+if (d.getElementById(id)) return;
+js = d.createElement(s); js.id = id;
+js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
+fjs.parentNode.insertBefore(js, fjs);
+}(document, 'script', 'facebook-jssdk'));</script>
+.
+.
+.
+</body>
+</html>
diff --git a/tests/samples/misc/opengraph_test.json b/tests/samples/misc/opengraph_test.json
@@ -0,0 +1,33 @@
+[
+    {
+        "namespace": {
+            "og": "http://ogp.me/ns#"
+        },
+        "properties": [
+            [
+                "og:title",
+                "Himanshu's Open Graph Protocol"
+            ],
+            [
+                "og:type",
+                "article"
+            ],
+            [
+                "og:url",
+                "https://www.eventeducation.com/test.php"
+            ],
+            [
+                "og:image",
+                "https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"
+            ],
+            [
+                "og:site_name",
+                "Event Education"
+            ],
+            [
+                "og:description",
+                "Event Education provides free courses on event planning and management to event professionals worldwide."
+            ]
+        ]
+    }
+]