Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] CSS support in link extractors #983

Merged
merged 4 commits into from
Mar 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/topics/link-extractors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ LxmlLinkExtractor
:synopsis: lxml's HTMLParser-based link extractors


.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)

LxmlLinkExtractor is the recommended link extractor with handy filtering
options. It is implemented using lxml's robust HTMLParser.
Expand Down Expand Up @@ -88,6 +88,11 @@ LxmlLinkExtractor
links. See examples below.
:type restrict_xpaths: str or list

:param restrict_css: a CSS selector (or list of selectors) which defines
regions inside the response where links should be extracted from.
Has the same behaviour as ``restrict_xpaths``.
:type restrict_css: str or list

:param tags: a tag or a list of tags to consider when extracting links.
Defaults to ``('a', 'area')``.
:type tags: str or list
Expand Down
11 changes: 6 additions & 5 deletions scrapy/contrib/linkextractors/lxmlhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ def _process_links(self, links):
class LxmlLinkExtractor(FilteringLinkExtractor):

def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
tags=('a', 'area'), attrs=('href',), canonicalize=True,
unique=True, process_value=None, deny_extensions=None, restrict_css=()):
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
unique=unique, process=process_value)

super(LxmlLinkExtractor, self).__init__(lx, allow, deny,
allow_domains, deny_domains, restrict_xpaths, canonicalize,
deny_extensions)
super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
allow_domains=allow_domains, deny_domains=deny_domains,
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
canonicalize=canonicalize, deny_extensions=deny_extensions)

def extract_links(self, response):
html = Selector(response)
Expand Down
11 changes: 6 additions & 5 deletions scrapy/contrib/linkextractors/sgml.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ def matches(self, url):
class SgmlLinkExtractor(FilteringLinkExtractor):

def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
process_value=None, deny_extensions=None, restrict_css=()):

warnings.warn(
"SgmlLinkExtractor is deprecated and will be removed in future releases. "
Expand All @@ -115,9 +115,10 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
unique=unique, process_value=process_value)

super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
allow_domains, deny_domains, restrict_xpaths, canonicalize,
deny_extensions)
super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
allow_domains=allow_domains, deny_domains=deny_domains,
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
canonicalize=canonicalize, deny_extensions=deny_extensions)

# FIXME: was added to fix a RegexLinkExtractor testcase
self.base_url = None
Expand Down
8 changes: 7 additions & 1 deletion scrapy/linkextractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
from six.moves.urllib.parse import urlparse

from scrapy.selector.csstranslator import ScrapyHTMLTranslator
from scrapy.utils.url import url_is_from_any_domain
from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
from scrapy.utils.misc import arg_to_iter
Expand Down Expand Up @@ -38,8 +39,10 @@

class FilteringLinkExtractor(object):

_csstranslator = ScrapyHTMLTranslator()

def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
restrict_xpaths, canonicalize, deny_extensions):
restrict_xpaths, canonicalize, deny_extensions, restrict_css):

self.link_extractor = link_extractor

Expand All @@ -50,6 +53,9 @@ def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
self.deny_domains = set(arg_to_iter(deny_domains))

self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
arg_to_iter(restrict_css)))

self.canonicalize = canonicalize
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
Expand Down
15 changes: 15 additions & 0 deletions tests/test_contrib_linkextractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,21 @@ def test_restrict_xpaths_concat_in_handle_data(self):
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
fragment='', nofollow=False)])

def test_restrict_css(self):
    """Links are extracted only from regions matched by restrict_css."""
    extractor = self.extractor_cls(restrict_css=('#subwrapper a',))
    expected = [Link(url='http://example.com/sample2.html', text=u'sample 2')]
    self.assertEqual(extractor.extract_links(self.response), expected)

def test_restrict_css_and_restrict_xpaths_together(self):
    """restrict_xpaths and restrict_css regions are combined, not exclusive."""
    extractor = self.extractor_cls(
        restrict_xpaths=('//div[@id="subwrapper"]', ),
        restrict_css=('#subwrapper + a', ),
    )
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)

def test_area_tag_with_unicode_present(self):
body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
Expand Down