Merge pull request #12 from mborho/master
Robustness
psolbach committed Apr 16, 2018
2 parents 769ddbc + 75dd34e commit 7641ed7
Showing 24 changed files with 17,345 additions and 129 deletions.
metadoc/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@
 __copyright__ = 'Copyright 2016, Paul Solbach'
 __author__ = 'Paul Solbach'
 __license__ = 'MIT'
-__version__ = '0.6.0'
+__version__ = '0.7.0'
 
 import asyncio
 import time
metadoc/extract/extractor.py: 17 changes (6 additions, 11 deletions)
@@ -82,20 +82,16 @@ def extract_text(self):
     def extract_metadata(self):
         """Sniff for essential and additional metadata via
         either metatags and or json-ld"""
-
         html_meta = HtmlMeta(self.html, tree=self.tree)
         html_meta.extract()
 
-        self.authors = html_meta.jsonld.get("authors") \
-            or html_meta.metatags.get("article:author") \
-            or html_meta.metatags.get("author")
-
-        self.title = html_meta.jsonld.get("headline") or html_meta.title
-        self.description = html_meta.metatags.get("description")
-        self.canonical_url = html_meta.links.get("canonical")
-        self.image = html_meta.metatags.get("og:image") or html_meta.jsonld.get("thumbnailUrl")
+        # data
+        self.authors = html_meta.authors
+        self.title = html_meta.title
+        self.description = html_meta.description
+        self.canonical_url = html_meta.canonical_url
+        self.image = html_meta.image
         self.published_date = html_meta.published_date
-
         self.modified_date = html_meta.modified_date
         self.scraped_date = html_meta.scraped_date
 
@@ -122,4 +118,3 @@ def get_all(self):
         self.get_contenthash()
         self.get_reading_time()
         logging.info("--- extraction module %s seconds ---" % (time.time() - start_time))
-        return
metadoc/extract/html.py: 162 changes (115 additions, 47 deletions)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import json
+import logging
 import lxml.etree, lxml.html
 from datetime import datetime
 from dateutil.parser import parse
@@ -24,9 +25,106 @@ def __init__(self, html, encoding="UTF-8", tree=None):
         self._metatag_xpath = lxml.etree.XPath("//meta")
         self._links_xpath = lxml.etree.XPath("//link")
 
-        self.links = None
+        self.links = {}
         self.jsonld = {}
-        self.links = None
         self.metatags = {}
 
+    @property
+    def title(self):
+        return self.jsonld.get("headline") \
+            or self.metatags.get("og:title") \
+            or self.extract_title()
+
+    @property
+    def description(self):
+        return self.metatags.get("og:description") \
+            or self.metatags.get("description", "").strip()
+
+    @property
+    def canonical_url(self):
+        return self.links.get("canonical")
+
+    @property
+    def image(self):
+        return self.metatags.get("og:image") \
+            or self.jsonld.get("thumbnailUrl")
+
+    def _extract_ld_authors(self):
+        # extract from jsonld
+        ld_authors = self.jsonld.get("author", {})
+        # sanitize ld structure
+        if type(ld_authors) == str:
+            ld_authors = {"name": ld_authors}
+        ld_authors = [a["name"] for a in ld_authors] if type(ld_authors) == list else ld_authors.get("name", False)
+        return ld_authors
+
+    @property
+    def authors(self):
+        # get a value from trove
+        authors = self._extract_ld_authors() \
+            or self.metatags.get("author") \
+            or self.metatags.get("article:author") \
+            or self.metatags.get("dcterms.creator") \
+            or self.metatags.get("article:authorName") \
+            or self.metatags.get("citation_author") \
+            or self.jsonld.get("authors")  # intercept
+
+        if authors:
+            # ensure list
+            if type(authors) != list:
+                authors = [authors]
+            # strip links
+            authors = [a for a in authors if a.startswith("http") == False]
+
+        if not authors:
+            # washingtonpost
+            xauthors = self.document.xpath("(//span[@itemprop='author'])[1]//span[@itemprop='name']/text()")
+            if xauthors:
+                authors = xauthors
+
+        return authors if authors else []
+
+    @property
+    def published_date(self):
+        res = None
+        xpaths = [
+            "//meta[@name='date']/@content",
+            "//meta[@property='article:published_time']/@content",
+            "//meta[@property='article:published']/@content",
+            "//meta[@name='parsely-pub-date']/@content",
+            "//meta[@name='DC.date.issued']/@content",
+            "//time[@itemprop='datePublished']/@datetime",
+        ]
+        res = self._query_date(xpaths)
+        if res is None:
+            ld_date = self.jsonld.get("datePublished") or self.jsonld.get("dateCreated")
+            if ld_date:
+                res = self._format_date(ld_date)
+        return res
+
+    @property
+    def modified_date(self):
+        res = None
+        xpaths = [
+            "//meta[@property='article:modified_time']/@content",
+            "//meta[@property='article:modified']/@content",
+            "//meta[@name='last-modified']/@content",
+        ]
+        res = self._query_date(xpaths)
+        if res is None:
+            ld_date = self.jsonld.get("dateModified")
+            if ld_date:
+                res = self._format_date(ld_date)
+        return res
+
+    @property
+    def scraped_date(self):
+        return self._format_date(datetime.now())
+
+    def extract(self):
+        self.metatags = self._extract_items(self._get_metatag_item, self._metatag_xpath)
+        self.jsonld = self._extract_items(self._get_jsonld_item, self._jsonld_xpath)
+        self.links = self._extract_items(self._get_link_item, self._links_xpath)
 
     def _extract_items(self, get_item, xpath):
         items = [item for item in map(get_item, xpath(self.document)) if item]
@@ -47,10 +145,20 @@ def _get_link_item(self, node):
             if (name and content) else None
 
     def _get_jsonld_item(self, node):
-        ld = json.loads(node.text.strip())
-        if type(ld) is list:
-            for item in [i for i in ld if i.get("@type") == "NewsArticle"]:
-                return item
+        ld = None
+        try:
+            ld_text = node.text.strip()
+            # sanitize if necessary
+            if ld_text.find("<![CDATA[") > -1:
+                ld_text = ld_text[ld_text.find("{"):ld_text.rfind("}")+1]
+
+            ld = json.loads(ld_text)
+            if type(ld) is list:
+                for item in [i for i in ld if i.get("@type") == "NewsArticle"]:
+                    return item
+        except Exception as exc:
+            logging.error("JSON-LD parsing failed")
+            logging.exception(exc)
         return ld if ld else {}
 
     def extract_title(self):
@@ -67,47 +175,7 @@ def _query_date(self, xpath_rules):
             dates = self.document.xpath(xpath_rule)
             if len(dates) > 0:
                 try:
-                    return self._format_date(dates[0].get("content"))
+                    return self._format_date(str(dates[0]))#.get("content"))
                 except:
                     pass
         return None
-
-    def extract_pub_date(self):
-        res = None
-        xpaths = [
-            "//meta[@name='date']",
-            "//meta[@property='article:published_time']",
-            "//meta[@property='article:published']",
-            "//meta[@name='parsely-pub-date']",
-            "//meta[@name='DC.date.issued']",
-        ]
-        res = self._query_date(xpaths)
-        if res is None:
-            ld_date = self.jsonld.get("datePublished") or self.jsonld.get("dateCreated")
-            if ld_date:
-                res = self._format_date(ld_date)
-        return res
-
-    def extract_mod_date(self):
-        res = None
-        xpaths = [
-            "//meta[@property='article:modified_time']",
-            "//meta[@property='article:modified']",
-            "//meta[@name='last-modified']",
-        ]
-        res = self._query_date(xpaths)
-        if res is None:
-            ld_date = self.jsonld.get("dateModified")
-            if ld_date:
-                res = self._format_date(ld_date)
-        return res
-
-    def extract(self):
-        self.metatags = self._extract_items(self._get_metatag_item, self._metatag_xpath)
-        self.jsonld = self._extract_items(self._get_jsonld_item, self._jsonld_xpath)
-        self.links = self._extract_items(self._get_link_item, self._links_xpath)
-        self.title = self.extract_title()
-        self.published_date = self.extract_pub_date()
-        self.modified_date = self.extract_mod_date()
-        self.scraped_date = self._format_date(datetime.now())
-        return
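
A quick usage sketch of the reworked HtmlMeta, to make the new property surface concrete. The HTML fragment below is invented for illustration; it exercises the JSON-LD fallback chain and the new CDATA sanitization, assuming the class is importable from the package as laid out in this repo. Note that extract() must run before the properties return anything, since metatags, jsonld and links start out empty:

    from metadoc.extract.html import HtmlMeta

    # Invented test document: one OpenGraph metatag plus a CDATA-wrapped
    # JSON-LD block of the kind the sanitizer above now handles.
    html = """
    <html><head>
      <title>Fallback Title</title>
      <meta property="og:image" content="https://example.com/img.jpg" />
      <script type="application/ld+json">//<![CDATA[
        {"@type": "NewsArticle", "headline": "Hello",
         "author": {"name": "Jane Doe"},
         "datePublished": "2018-04-16T12:00:00"}
      //]]></script>
    </head><body></body></html>
    """

    meta = HtmlMeta(html)
    meta.extract()  # populates .metatags, .jsonld and .links

    print(meta.title)           # "Hello": JSON-LD headline wins over <title>
    print(meta.authors)         # ["Jane Doe"], via _extract_ld_authors()
    print(meta.image)           # "https://example.com/img.jpg"
    print(meta.published_date)  # JSON-LD date, normalized by _format_date()

The date properties try the metatag XPaths first and only then fall back to JSON-LD, so datePublished above is still picked up even though the document carries no date metatags.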
metadoc/extract/ner.py: 9 changes (5 additions, 4 deletions)
@@ -3,15 +3,16 @@
 
 import imp
 import sys
+import os
 
-# overwrite sqlite with dummy modules, for AWS Lambda
-sys.modules["sqlite"] = imp.new_module("sqlite")
-sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
+if os.environ.get("LAMBDA_TASK_ROOT", False):
+    # overwrite sqlite with dummy modules, for AWS Lambda
+    sys.modules["sqlite"] = imp.new_module("sqlite")
+    sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
 import nltk
 
 import difflib
 import operator
-import os
 import numpy
 import string
 import re
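
Context for this change: LAMBDA_TASK_ROOT is an environment variable the AWS Lambda runtime sets, so the dummy modules are now installed only on Lambda (presumably because nltk pulls sqlite3 into its import chain, and the native sqlite extension is unavailable there), while local environments keep their real sqlite3. The stub works because Python consults sys.modules before touching disk; a standalone sketch of that mechanism, not project code:

    import imp
    import sys

    # Registering an empty module object under a name short-circuits any
    # later import of that name: Python finds it in sys.modules first and
    # never tries to load the (possibly missing) C extension from disk.
    sys.modules["sqlite"] = imp.new_module("sqlite")

    import sqlite  # no ImportError: resolves to the dummy module
    print(sqlite)  # <module 'sqlite'>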
metadoc/social/activity.py: 32 changes (18 additions, 14 deletions)
@@ -35,18 +35,22 @@ async def get_json(self, url):
             return await response.read()
 
     async def collect_sharecount(self, url, provider):
-        response = await self.get_json(url)
-        j = json.loads(response)
-
-        data = {
-            "provider": provider["provider"],
-            "metrics": []
-        }
-
-        for m in provider["metrics"]:
-            data["metrics"].append({
-                "count": jmespath.search(m["path"], j),
-                "label": m["label"]
-            })
-
-        self.responses.append(data)
+        try:
+            response = await self.get_json(url)
+            j = json.loads(response)
+
+            data = {
+                "provider": provider["provider"],
+                "metrics": []
+            }
+
+            for m in provider["metrics"]:
+                data["metrics"].append({
+                    "count": jmespath.search(m["path"], j),
+                    "label": m["label"]
+                })
+            self.responses.append(data)
+        except Exception as exc:
+            logging.error("Collecting sharecount failed!")
+            logging.exception(exc)
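
For review context: the metric paths in each provider config are JMESPath expressions evaluated against the decoded JSON response, so a new share-count provider can be wired in without any new parsing code. A standalone sketch with an invented provider config and payload:

    import jmespath

    # Invented provider config and API payload, mirroring the shapes that
    # collect_sharecount works with.
    provider = {
        "provider": "facebook",
        "metrics": [{"label": "shares", "path": "share.share_count"}],
    }
    payload = {"share": {"share_count": 42, "comment_count": 7}}

    for m in provider["metrics"]:
        count = jmespath.search(m["path"], payload)  # -> 42
        print(m["label"], count)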
setup.py: 2 changes (1 addition, 1 deletion)
@@ -45,7 +45,7 @@ def run(self):
     author_email='p@psolbach.com',
     url='https://github.com/psolbach/metadoc',
     license=metadata["license"],
-    cmdclass={'install': CustomInstall, 'develop': DevInstall},
+    cmdclass={'install': CustomInstall, 'develop': DevInstall, 'bdist_wheel': CustomInstall},
     packages=find_packages(exclude=['tests']),
     include_package_data=True,
     zip_safe=False
tests/__init__.py: 3 changes (0 additions, 3 deletions)
@@ -1,5 +1,2 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
-from metadoc.extract.pos import do_train
-do_train()
