diff --git a/.gitignore b/.gitignore index 0652972..3910db7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ dist venv htmlcov *.swp +venv36 +metadoc/extract/data/* +.pytest_cache \ No newline at end of file diff --git a/README.md b/README.md index 46533a0..e728940 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,13 @@ python metadoc/__install__.py python serve.py => serving @ 6060 ``` +## Test +```shell +py.test -v tests +``` +If you happen to run into an error with OS X 10.11 concerning a lazily bound library in PIL, +just remove `/PIL/.dylibs/liblzma.5.dylib`. + ## Todo * Page concatenation is needed in order to properly calculate wordcount and reading time. * Authenticity heuristic with sharecount deviance detection (requires state). @@ -118,4 +125,3 @@ python serve.py => serving @ 6060 Metadoc stems from a pedigree of nice libraries like [libextract](https://github.com/datalib/libextract), [langdetect](https://github.com/Mimino666/langdetect) and [nltk](https://github.com/nltk/nltk). Metadoc leans on [this](https://github.com/hankcs/AveragedPerceptronPython) perceptron implementation inspired by Matthew Honnibal. 
Metadoc is work-in-progress and maintained by [@___paul](https://twitter.com/___paul) - diff --git a/metadoc/__init__.py b/metadoc/__init__.py index f34d8b8..f2281b8 100644 --- a/metadoc/__init__.py +++ b/metadoc/__init__.py @@ -181,7 +181,7 @@ def _request_url(self): req = requests.get(url, headers={ 'Accept-Encoding': 'identity, gzip, deflate, *', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' + 'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)' }) if req.status_code != 200: diff --git a/serve.py b/serve.py index b153579..714b6fb 100644 --- a/serve.py +++ b/serve.py @@ -58,7 +58,7 @@ def full_article(): abort(404) metadoc = Metadoc(url=url, html=html) - payload = metadoc.query_all() + payload = metadoc.query() return json.dumps(payload)