Skip to content

Commit

Permalink
Merge pull request #38 from scrapinghub/infer-domain
Browse files Browse the repository at this point in the history
Infer domain
  • Loading branch information
kmike committed Apr 6, 2017
2 parents 628c8c2 + 1856b46 commit 97d6d37
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pip-log.txt
cover
nosetests.xml
.cache
htmlcov/

# Translations
*.mo
Expand Down
8 changes: 5 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,15 @@ def __getattr__(cls, name):
else:
return Mock()


MOCK_MODULES = [
'lxml', 'lxml.html', 'lxml.html.clean', 'lxml.etree', 'lxml.sax',
'sklearn', 'sklearn.base', 'sklearn.metrics', 'sklearn.pipeline',
'numpy'
'numpy', 'tldextract',
]
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = Mock()

sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)


# -- General configuration ------------------------------------------------

Expand Down
7 changes: 7 additions & 0 deletions docs/ref/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,10 @@ Sequence Encoding
.. automodule:: webstruct.sequence_encoding
:members:
:undoc-members:


Webpage domain inferring
------------------------

.. automodule:: webstruct.infer_domain
:members:
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ scikit-learn >= 0.14
-e git+https://github.com/adsva/python-wapiti.git@f7eded69a951c50a461cbe62ecc809e28229eb8f#egg=python-wapiti
python-crfsuite >= 0.8.4
sklearn-crfsuite >= 0.3.3
tldextract
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
],
install_requires=['six', 'lxml', 'scikit-learn'],
install_requires=['six', 'lxml', 'scikit-learn', 'tldextract'],
)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ envlist = py27, py34, py35, docs
deps =
-r{toxinidir}/requirements.txt
-r{toxinidir}/requirements-dev.txt
commands = py.test --cov=webstruct {posargs:webstruct}
commands = py.test --cov=webstruct --cov-report=html --cov-report=term {posargs:webstruct}

[testenv:docs]
deps = -r{toxinidir}/requirements-doc.txt
Expand Down
73 changes: 73 additions & 0 deletions webstruct/infer_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
Module for getting a most likely base URL (domain) for a page.
It is useful if you've downloaded HTML files, but haven't preserved URLs
explicitly, and still want to have cross-validation done right.
Grouping pages by domain name is a reasonable way to do that.
WebAnnotator data has either <base> tags with original URLs
(or at least original domains), or commented-out <base> tags.
Unfortunately, GATE-annotated data doesn't have this tag.
So the idea is to use the most popular domain mentioned in a page as
the page's domain.
"""
import re
from collections import Counter

from webstruct.utils import get_domain


_find_base_href = re.compile(r'base\s+href="(.*)"').search
_DOMAIN_BLACKLIST = {
'google.com', 'twitter.com', 'facebook.com', 'youtube.com',
'fonts.com', 'googleapis.com', 'fonts.net', 'addthis.com',
'flickr.com', 'paypal.com', 'pinterest.com', 'linkedin.com',
}


def get_tree_domain(tree, blacklist=_DOMAIN_BLACKLIST, get_domain=get_domain):
    """
    Return the most likely domain for the tree. The domain is taken from a
    <base> tag (possibly commented out) when one is present; otherwise it
    is guessed from the links on the page. An empty string is returned
    when no domain can be detected.
    """
    base_href = get_base_href(tree)
    if not base_href:
        # no usable <base> tag - fall back to counting link domains
        return guess_domain(tree, blacklist, get_domain)
    return get_domain(base_href)


def guess_domain(tree, blacklist=_DOMAIN_BLACKLIST, get_domain=get_domain):
    """ Return most common domain not in a black list. """
    counts = Counter(
        domain
        for domain in (get_domain(href) for href in tree.xpath('//*/@href'))
        if domain and domain not in blacklist
    )
    if not counts:
        return ''  # no usable links -> unknown
    top_count = max(counts.values())
    # break ties deterministically: alphabetically first domain wins
    return min(d for d, c in counts.items() if c == top_count)


def get_base_href(tree):
    """ Return href of a base tag; base tag could be commented out. """
    # a real <base> tag wins; fall back to a commented-out one
    # (both helpers return None/'' when nothing is found)
    return _get_base_href(tree) or _get_commented_base_href(tree)


def _get_commented_base_href(tree):
    """ Return href value found in a commented out <base> tag """
    for node in tree.xpath('//head/comment()'):
        match = _find_base_href(node.text)
        if match is not None:
            return match.group(1)
    return None


def _get_base_href(tree):
""" Return href value of a base tag """
base_hrefs = tree.xpath('//base/@href')
if base_hrefs:
return base_hrefs[0]
71 changes: 71 additions & 0 deletions webstruct/tests/test_infer_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from webstruct.infer_domain import guess_domain, get_tree_domain, get_base_href
from webstruct.loaders import HtmlLoader


def _load(html_bytes):
    """ Parse raw HTML bytes into a webstruct tree. """
    return HtmlLoader().loadbytes(html_bytes)


def test_guess_domain():
    # twitter.com is blacklisted; example.com occurs twice (directly and
    # via the foo.example.com subdomain), so it beats example2.com.
    html = b"""
    <html>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_tree_domain(tree) == "example.com"
    assert get_base_href(tree) is None


def test_baseurl():
    # an explicit <base> tag must override the guessed most-common domain
    html = b"""
    <html>
    <head>
    <base href="http://example.org/foo"/>
    </head>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_base_href(tree) == "http://example.org/foo"
    assert get_tree_domain(tree) == "example.org"


def test_commented_baseurl():
    # a commented-out <base> tag should work the same as a real one
    html = b"""
    <html>
    <head>
    <!--base href="http://example.org/foo"/-->
    </head>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_base_href(tree) == "http://example.org/foo"
    assert get_tree_domain(tree) == "example.org"


def test_no_links():
    # nothing to infer from: no hrefs and no base tag at all
    tree = _load(b"""<html><body><p>empty</p></body></html>""")
    assert get_base_href(tree) is None
    assert guess_domain(tree) == ""
    assert get_tree_domain(tree) == ""
14 changes: 14 additions & 0 deletions webstruct/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from functools import partial
from itertools import chain
from six.moves import range

import tldextract
import lxml.html
from lxml.etree import iterwalk

Expand Down Expand Up @@ -308,3 +310,15 @@ def train_test_split_noshuffle(*arrays, **options):
return list(chain.from_iterable(
(a[:-test_size], a[-test_size:]) for a in arrays
))


def get_domain(url):
    """
    Return the registered domain ("eTLD+1") of *url*: subdomains are
    dropped, while multi-part public suffixes such as ``co.uk`` are kept.

    >>> get_domain("http://example.com/path")
    'example.com'
    >>> get_domain("https://hello.example.com/foo/bar")
    'example.com'
    >>> get_domain("http://hello.example.co.uk/foo?bar=1")
    'example.co.uk'
    """
    parts = tldextract.extract(url)
    return parts.registered_domain

0 comments on commit 97d6d37

Please sign in to comment.