Merge pull request #12 from mborho/master
Robustness
psolbach committed Apr 16, 2018
2 parents 769ddbc + 75dd34e commit 7641ed7
Showing 24 changed files with 17,345 additions and 129 deletions.
metadoc/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@
 __copyright__ = 'Copyright 2016, Paul Solbach'
 __author__ = 'Paul Solbach'
 __license__ = 'MIT'
-__version__ = '0.6.0'
+__version__ = '0.7.0'
 
 import asyncio
 import time
metadoc/extract/extractor.py: 17 changes (6 additions, 11 deletions)
@@ -82,20 +82,16 @@ def extract_text(self):
     def extract_metadata(self):
         """Sniff for essential and additional metadata via
         either metatags and or json-ld"""
-
         html_meta = HtmlMeta(self.html, tree=self.tree)
         html_meta.extract()
 
-        self.authors = html_meta.jsonld.get("authors") \
-            or html_meta.metatags.get("article:author") \
-            or html_meta.metatags.get("author")
-
-        self.title = html_meta.jsonld.get("headline") or html_meta.title
-        self.description = html_meta.metatags.get("description")
-        self.canonical_url = html_meta.links.get("canonical")
-        self.image = html_meta.metatags.get("og:image") or html_meta.jsonld.get("thumbnailUrl")
+        # data
+        self.authors = html_meta.authors
+        self.title = html_meta.title
+        self.description = html_meta.description
+        self.canonical_url = html_meta.canonical_url
+        self.image = html_meta.image
         self.published_date = html_meta.published_date
-
         self.modified_date = html_meta.modified_date
         self.scraped_date = html_meta.scraped_date
 
@@ -122,4 +118,3 @@ def get_all(self):
         self.get_contenthash()
         self.get_reading_time()
         logging.info("--- extraction module %s seconds ---" % (time.time() - start_time))
-        return
metadoc/extract/html.py: 162 changes (115 additions, 47 deletions)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import json
+import logging
 import lxml.etree, lxml.html
 from datetime import datetime
 from dateutil.parser import parse
@@ -24,9 +25,106 @@ def __init__(self, html, encoding="UTF-8", tree=None):
         self._metatag_xpath = lxml.etree.XPath("//meta")
         self._links_xpath = lxml.etree.XPath("//link")
 
-        self.links = None
+        self.links = {}
         self.jsonld = {}
-        self.links = None
         self.metatags = {}
 
+    @property
+    def title(self):
+        return self.jsonld.get("headline") \
+            or self.metatags.get("og:title") \
+            or self.extract_title()
+
+    @property
+    def description(self):
+        return self.metatags.get("og:description") \
+            or self.metatags.get("description", "").strip()
+
+    @property
+    def canonical_url(self):
+        return self.links.get("canonical")
+
+    @property
+    def image(self):
+        return self.metatags.get("og:image") \
+            or self.jsonld.get("thumbnailUrl")
+
+    def _extract_ld_authors(self):
+        # extract from jsonld
+        ld_authors = self.jsonld.get("author", {})
+        # sanitize ld structure
+        if type(ld_authors) == str:
+            ld_authors = {"name": ld_authors}
+        ld_authors = [a["name"] for a in ld_authors] if type(ld_authors) == list else ld_authors.get("name", False)
+        return ld_authors
+
+    @property
+    def authors(self):
+        # get a value from trove
+        authors = self._extract_ld_authors() \
+            or self.metatags.get("author") \
+            or self.metatags.get("article:author") \
+            or self.metatags.get("dcterms.creator") \
+            or self.metatags.get("article:authorName") \
+            or self.metatags.get("citation_author") \
+            or self.jsonld.get("authors")  # intercept
+
+        if authors:
+            # ensure list
+            if type(authors) != list:
+                authors = [authors]
+            # strip links
+            authors = [a for a in authors if a.startswith("http") == False]
+
+        if not authors:
+            # washingtonpost
+            xauthors = self.document.xpath("(//span[@itemprop='author'])[1]//span[@itemprop='name']/text()")
+            if xauthors:
+                authors = xauthors
+
+        return authors if authors else []
+
+    @property
+    def published_date(self):
+        res = None
+        xpaths = [
+            "//meta[@name='date']/@content",
+            "//meta[@property='article:published_time']/@content",
+            "//meta[@property='article:published']/@content",
+            "//meta[@name='parsely-pub-date']/@content",
+            "//meta[@name='DC.date.issued']/@content",
+            "//time[@itemprop='datePublished']/@datetime",
+        ]
+        res = self._query_date(xpaths)
+        if res is None:
+            ld_date = self.jsonld.get("datePublished") or self.jsonld.get("dateCreated")
+            if ld_date:
+                res = self._format_date(ld_date)
+        return res
+
+    @property
+    def modified_date(self):
+        res = None
+        xpaths = [
+            "//meta[@property='article:modified_time']/@content",
+            "//meta[@property='article:modified']/@content",
+            "//meta[@name='last-modified']/@content",
+        ]
+        res = self._query_date(xpaths)
+        if res is None:
+            ld_date = self.jsonld.get("dateModified")
+            if ld_date:
+                res = self._format_date(ld_date)
+        return res
+
+    @property
+    def scraped_date(self):
+        return self._format_date(datetime.now())
+
+    def extract(self):
+        self.metatags = self._extract_items(self._get_metatag_item, self._metatag_xpath)
+        self.jsonld = self._extract_items(self._get_jsonld_item, self._jsonld_xpath)
+        self.links = self._extract_items(self._get_link_item, self._links_xpath)
 
     def _extract_items(self, get_item, xpath):
         items = [item for item in map(get_item, xpath(self.document)) if item]
@@ -47,10 +145,20 @@ def _get_link_item(self, node):
             if (name and content) else None
 
     def _get_jsonld_item(self, node):
-        ld = json.loads(node.text.strip())
-        if type(ld) is list:
-            for item in [i for i in ld if i.get("@type") == "NewsArticle"]:
-                return item
+        ld = None
+        try:
+            ld_text = node.text.strip()
+            # sanitize if necessary
+            if ld_text.find("<![CDATA[") > -1:
+                ld_text = ld_text[ld_text.find("{"):ld_text.rfind("}")+1]
+
+            ld = json.loads(ld_text)
+            if type(ld) is list:
+                for item in [i for i in ld if i.get("@type") == "NewsArticle"]:
+                    return item
+        except Exception as exc:
+            logging.error("JSON-LD parsing failed")
+            logging.exception(exc)
         return ld if ld else {}
 
     def extract_title(self):
@@ -67,47 +175,7 @@ def _query_date(self, xpath_rules):
             dates = self.document.xpath(xpath_rule)
             if len(dates) > 0:
                 try:
-                    return self._format_date(dates[0].get("content"))
+                    return self._format_date(str(dates[0]))#.get("content"))
                 except:
                     pass
         return None
-
-    def extract_pub_date(self):
-        res = None
-        xpaths = [
-            "//meta[@name='date']",
-            "//meta[@property='article:published_time']",
-            "//meta[@property='article:published']",
-            "//meta[@name='parsely-pub-date']",
-            "//meta[@name='DC.date.issued']",
-        ]
-        res = self._query_date(xpaths)
-        if res is None:
-            ld_date = self.jsonld.get("datePublished") or self.jsonld.get("dateCreated")
-            if ld_date:
-                res = self._format_date(ld_date)
-        return res
-
-    def extract_mod_date(self):
-        res = None
-        xpaths = [
-            "//meta[@property='article:modified_time']",
-            "//meta[@property='article:modified']",
-            "//meta[@name='last-modified']",
-        ]
-        res = self._query_date(xpaths)
-        if res is None:
-            ld_date = self.jsonld.get("dateModified")
-            if ld_date:
-                res = self._format_date(ld_date)
-        return res
-
-    def extract(self):
-        self.metatags = self._extract_items(self._get_metatag_item, self._metatag_xpath)
-        self.jsonld = self._extract_items(self._get_jsonld_item, self._jsonld_xpath)
-        self.links = self._extract_items(self._get_link_item, self._links_xpath)
-        self.title = self.extract_title()
-        self.published_date = self.extract_pub_date()
-        self.modified_date = self.extract_mod_date()
-        self.scraped_date = self._format_date(datetime.now())
-        return
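
A quick usage sketch of the reworked HtmlMeta, to make the new property surface concrete. The HTML fragment below is invented for illustration; it exercises the JSON-LD fallback chain and the new CDATA sanitization, assuming the class is importable from the package as laid out in this repo. Note that extract() must run before the properties return anything, since metatags, jsonld and links start out empty:

    from metadoc.extract.html import HtmlMeta

    # Invented test document: one OpenGraph metatag plus a CDATA-wrapped
    # JSON-LD block of the kind the sanitizer above now handles.
    html = """
    <html><head>
      <title>Fallback Title</title>
      <meta property="og:image" content="https://example.com/img.jpg" />
      <script type="application/ld+json">//<![CDATA[
        {"@type": "NewsArticle", "headline": "Hello",
         "author": {"name": "Jane Doe"},
         "datePublished": "2018-04-16T12:00:00"}
      //]]></script>
    </head><body></body></html>
    """

    meta = HtmlMeta(html)
    meta.extract()  # populates .metatags, .jsonld and .links

    print(meta.title)           # "Hello": JSON-LD headline wins over <title>
    print(meta.authors)         # ["Jane Doe"], via _extract_ld_authors()
    print(meta.image)           # "https://example.com/img.jpg"
    print(meta.published_date)  # JSON-LD date, normalized by _format_date()

The date properties try the metatag XPaths first and only then fall back to JSON-LD, so datePublished above is still picked up even though the document carries no date metatags.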
metadoc/extract/ner.py: 9 changes (5 additions, 4 deletions)
@@ -3,15 +3,16 @@
 
 import imp
 import sys
+import os
 
-# overwrite sqlite with dummy modules, for AWS Lambda
-sys.modules["sqlite"] = imp.new_module("sqlite")
-sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
+if os.environ.get("LAMBDA_TASK_ROOT", False):
+    # overwrite sqlite with dummy modules, for AWS Lambda
+    sys.modules["sqlite"] = imp.new_module("sqlite")
+    sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
 import nltk
 
 import difflib
 import operator
-import os
 import numpy
 import string
 import re
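
Context for this change: LAMBDA_TASK_ROOT is an environment variable the AWS Lambda runtime sets, so the dummy modules are now installed only on Lambda (presumably because nltk pulls sqlite3 into its import chain, and the native sqlite extension is unavailable there), while local environments keep their real sqlite3. The stub works because Python consults sys.modules before touching disk; a standalone sketch of that mechanism, not project code:

    import imp
    import sys

    # Registering an empty module object under a name short-circuits any
    # later import of that name: Python finds it in sys.modules first and
    # never tries to load the (possibly missing) C extension from disk.
    sys.modules["sqlite"] = imp.new_module("sqlite")

    import sqlite  # no ImportError: resolves to the dummy module
    print(sqlite)  # <module 'sqlite'>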
metadoc/social/activity.py: 32 changes (18 additions, 14 deletions)
@@ -35,18 +35,22 @@ async def get_json(self, url):
             return await response.read()
 
     async def collect_sharecount(self, url, provider):
-        response = await self.get_json(url)
-        j = json.loads(response)
-
-        data = {
-            "provider": provider["provider"],
-            "metrics": []
-        }
-
-        for m in provider["metrics"]:
-            data["metrics"].append({
-                "count": jmespath.search(m["path"], j),
-                "label": m["label"]
-            })
-
-        self.responses.append(data)
+        try:
+            response = await self.get_json(url)
+            j = json.loads(response)
+
+            data = {
+                "provider": provider["provider"],
+                "metrics": []
+            }
+
+            for m in provider["metrics"]:
+                data["metrics"].append({
+                    "count": jmespath.search(m["path"], j),
+                    "label": m["label"]
+                })
+            self.responses.append(data)
+        except Exception as exc:
+            logging.error("Collecting sharecount failed!")
+            logging.exception(exc)
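
For review context: the metric paths in each provider config are JMESPath expressions evaluated against the decoded JSON response, so a new share-count provider can be wired in without any new parsing code. A standalone sketch with an invented provider config and payload:

    import jmespath

    # Invented provider config and API payload, mirroring the shapes that
    # collect_sharecount works with.
    provider = {
        "provider": "facebook",
        "metrics": [{"label": "shares", "path": "share.share_count"}],
    }
    payload = {"share": {"share_count": 42, "comment_count": 7}}

    for m in provider["metrics"]:
        count = jmespath.search(m["path"], payload)  # -> 42
        print(m["label"], count)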
setup.py: 2 changes (1 addition, 1 deletion)
@@ -45,7 +45,7 @@ def run(self):
     author_email='p@psolbach.com',
     url='https://github.com/psolbach/metadoc',
     license=metadata["license"],
-    cmdclass={'install': CustomInstall, 'develop': DevInstall},
+    cmdclass={'install': CustomInstall, 'develop': DevInstall, 'bdist_wheel': CustomInstall},
     packages=find_packages(exclude=['tests']),
     include_package_data=True,
     zip_safe=False
tests/__init__.py: 3 changes (0 additions, 3 deletions)
@@ -1,5 +1,2 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
-from metadoc.extract.pos import do_train
-do_train()
