Skip to content

Commit

Permalink
Merge pull request #38 from scrapinghub/infer-domain
Browse files Browse the repository at this point in the history
Infer domain
  • Loading branch information
kmike committed Apr 6, 2017
2 parents 628c8c2 + 1856b46 commit 97d6d37
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pip-log.txt
cover
nosetests.xml
.cache
htmlcov/

# Translations
*.mo
Expand Down
8 changes: 5 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,15 @@ def __getattr__(cls, name):
else:
return Mock()


MOCK_MODULES = [
'lxml', 'lxml.html', 'lxml.html.clean', 'lxml.etree', 'lxml.sax',
'sklearn', 'sklearn.base', 'sklearn.metrics', 'sklearn.pipeline',
'numpy'
'numpy', 'tldextract',
]
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = Mock()

sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)


# -- General configuration ------------------------------------------------

Expand Down
7 changes: 7 additions & 0 deletions docs/ref/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,10 @@ Sequence Encoding
.. automodule:: webstruct.sequence_encoding
:members:
:undoc-members:


Webpage domain inferring
------------------------

.. automodule:: webstruct.infer_domain
:members:
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ scikit-learn >= 0.14
-e git+https://github.com/adsva/python-wapiti.git@f7eded69a951c50a461cbe62ecc809e28229eb8f#egg=python-wapiti
python-crfsuite >= 0.8.4
sklearn-crfsuite >= 0.3.3
tldextract
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
],
install_requires=['six', 'lxml', 'scikit-learn'],
install_requires=['six', 'lxml', 'scikit-learn', 'tldextract'],
)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ envlist = py27, py34, py35, docs
deps =
-r{toxinidir}/requirements.txt
-r{toxinidir}/requirements-dev.txt
commands = py.test --cov=webstruct {posargs:webstruct}
commands = py.test --cov=webstruct --cov-report=html --cov-report=term {posargs:webstruct}

[testenv:docs]
deps = -r{toxinidir}/requirements-doc.txt
Expand Down
73 changes: 73 additions & 0 deletions webstruct/infer_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
Module for getting a most likely base URL (domain) for a page.
It is useful if you've downloaded HTML files, but haven't preserved URLs
explicitly, and still want to have cross-validation done right.
Grouping pages by domain name is a reasonable way to do that.
WebAnnotator data has either <base> tags with original URLs
(or at least original domains), or commented-out <base> tags.
Unfortunately, GATE-annotated data doesn't have this tag.
So the idea is to use the most popular domain mentioned in a page as
the page's domain.
"""
import re
from collections import Counter

from webstruct.utils import get_domain


_find_base_href = re.compile(r'base\s+href="(.*)"').search
_DOMAIN_BLACKLIST = {
'google.com', 'twitter.com', 'facebook.com', 'youtube.com',
'fonts.com', 'googleapis.com', 'fonts.net', 'addthis.com',
'flickr.com', 'paypal.com', 'pinterest.com', 'linkedin.com',
}


def get_tree_domain(tree, blacklist=_DOMAIN_BLACKLIST, get_domain=get_domain):
    """
    Return the most likely domain for the tree. The domain is taken from a
    <base> tag (possibly commented out) when one is present; otherwise it
    is guessed from the links on the page. An empty string is returned
    when no domain can be detected.
    """
    base_href = get_base_href(tree)
    if not base_href:
        # no usable <base> tag - fall back to counting link domains
        return guess_domain(tree, blacklist, get_domain)
    return get_domain(base_href)


def guess_domain(tree, blacklist=_DOMAIN_BLACKLIST, get_domain=get_domain):
    """ Return most common domain not in a black list. """
    counts = Counter(
        domain
        for domain in (get_domain(href) for href in tree.xpath('//*/@href'))
        if domain and domain not in blacklist
    )
    if not counts:
        return ''  # no usable links -> unknown
    top_count = max(counts.values())
    # break ties deterministically: alphabetically first domain wins
    return min(d for d, c in counts.items() if c == top_count)


def get_base_href(tree):
    """ Return href of a base tag; base tag could be commented out. """
    # a real <base> tag wins; fall back to a commented-out one
    # (both helpers return None/'' when nothing is found)
    return _get_base_href(tree) or _get_commented_base_href(tree)


def _get_commented_base_href(tree):
    """ Return href value found in a commented out <base> tag """
    for node in tree.xpath('//head/comment()'):
        match = _find_base_href(node.text)
        if match is not None:
            return match.group(1)
    return None


def _get_base_href(tree):
""" Return href value of a base tag """
base_hrefs = tree.xpath('//base/@href')
if base_hrefs:
return base_hrefs[0]
71 changes: 71 additions & 0 deletions webstruct/tests/test_infer_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from webstruct.infer_domain import guess_domain, get_tree_domain, get_base_href
from webstruct.loaders import HtmlLoader


def _load(html_bytes):
    """ Parse raw HTML bytes into a webstruct tree. """
    return HtmlLoader().loadbytes(html_bytes)


def test_guess_domain():
    # twitter.com is blacklisted; example.com occurs twice (directly and
    # via the foo.example.com subdomain), so it beats example2.com.
    html = b"""
    <html>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_tree_domain(tree) == "example.com"
    assert get_base_href(tree) is None


def test_baseurl():
    # an explicit <base> tag must override the guessed most-common domain
    html = b"""
    <html>
    <head>
    <base href="http://example.org/foo"/>
    </head>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_base_href(tree) == "http://example.org/foo"
    assert get_tree_domain(tree) == "example.org"


def test_commented_baseurl():
    # a commented-out <base> tag should work the same as a real one
    html = b"""
    <html>
    <head>
    <!--base href="http://example.org/foo"/-->
    </head>
    <body>
    <a href="https://twitter.com/">share</a>
    <a href="http://example2.com/baz">baz</a>
    <a href="http://example.com/foo">foo</a>
    <a href="http://foo.example.com/bar">bar</a>
    </body>
    </html>
    """
    tree = _load(html)
    assert guess_domain(tree) == "example.com"
    assert get_base_href(tree) == "http://example.org/foo"
    assert get_tree_domain(tree) == "example.org"


def test_no_links():
    # nothing to infer from: no hrefs and no base tag at all
    tree = _load(b"""<html><body><p>empty</p></body></html>""")
    assert get_base_href(tree) is None
    assert guess_domain(tree) == ""
    assert get_tree_domain(tree) == ""
14 changes: 14 additions & 0 deletions webstruct/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from functools import partial
from itertools import chain
from six.moves import range

import tldextract
import lxml.html
from lxml.etree import iterwalk

Expand Down Expand Up @@ -308,3 +310,15 @@ def train_test_split_noshuffle(*arrays, **options):
return list(chain.from_iterable(
(a[:-test_size], a[-test_size:]) for a in arrays
))


def get_domain(url):
    """
    Return the registered domain ("eTLD+1") of *url*: subdomains are
    dropped, while multi-part public suffixes such as ``co.uk`` are kept.

    >>> get_domain("http://example.com/path")
    'example.com'
    >>> get_domain("https://hello.example.com/foo/bar")
    'example.com'
    >>> get_domain("http://hello.example.co.uk/foo?bar=1")
    'example.co.uk'
    """
    parts = tldextract.extract(url)
    return parts.registered_domain

0 comments on commit 97d6d37

Please sign in to comment.