Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] CSS support in link extractors #983

Merged
merged 4 commits into from
Mar 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/topics/link-extractors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ LxmlLinkExtractor
:synopsis: lxml's HTMLParser-based link extractors


.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)

LxmlLinkExtractor is the recommended link extractor with handy filtering
options. It is implemented using lxml's robust HTMLParser.
Expand Down Expand Up @@ -88,6 +88,11 @@ LxmlLinkExtractor
links. See examples below.
:type restrict_xpaths: str or list

:param restrict_css: a CSS selector (or list of selectors) which defines
regions inside the response where links should be extracted from.
Has the same behaviour as ``restrict_xpaths``.
:type restrict_css: str or list

:param tags: a tag or a list of tags to consider when extracting links.
Defaults to ``('a', 'area')``.
:type tags: str or list
Expand Down
11 changes: 6 additions & 5 deletions scrapy/contrib/linkextractors/lxmlhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ def _process_links(self, links):
class LxmlLinkExtractor(FilteringLinkExtractor):

def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
tags=('a', 'area'), attrs=('href',), canonicalize=True,
unique=True, process_value=None, deny_extensions=None, restrict_css=()):
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
unique=unique, process=process_value)

super(LxmlLinkExtractor, self).__init__(lx, allow, deny,
allow_domains, deny_domains, restrict_xpaths, canonicalize,
deny_extensions)
super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
allow_domains=allow_domains, deny_domains=deny_domains,
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
canonicalize=canonicalize, deny_extensions=deny_extensions)

def extract_links(self, response):
html = Selector(response)
Expand Down
11 changes: 6 additions & 5 deletions scrapy/contrib/linkextractors/sgml.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ def matches(self, url):
class SgmlLinkExtractor(FilteringLinkExtractor):

def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
process_value=None, deny_extensions=None, restrict_css=()):

warnings.warn(
"SgmlLinkExtractor is deprecated and will be removed in future releases. "
Expand All @@ -115,9 +115,10 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
unique=unique, process_value=process_value)

super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
allow_domains, deny_domains, restrict_xpaths, canonicalize,
deny_extensions)
super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
allow_domains=allow_domains, deny_domains=deny_domains,
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
canonicalize=canonicalize, deny_extensions=deny_extensions)

# FIXME: was added to fix a RegexLinkExtractor testcase
self.base_url = None
Expand Down
8 changes: 7 additions & 1 deletion scrapy/linkextractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
from six.moves.urllib.parse import urlparse

from scrapy.selector.csstranslator import ScrapyHTMLTranslator
from scrapy.utils.url import url_is_from_any_domain
from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
from scrapy.utils.misc import arg_to_iter
Expand Down Expand Up @@ -38,8 +39,10 @@

class FilteringLinkExtractor(object):

_csstranslator = ScrapyHTMLTranslator()

def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
restrict_xpaths, canonicalize, deny_extensions):
restrict_xpaths, canonicalize, deny_extensions, restrict_css):

self.link_extractor = link_extractor

Expand All @@ -50,6 +53,9 @@ def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
self.deny_domains = set(arg_to_iter(deny_domains))

self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
arg_to_iter(restrict_css)))

self.canonicalize = canonicalize
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
Expand Down
15 changes: 15 additions & 0 deletions tests/test_contrib_linkextractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,21 @@ def test_restrict_xpaths_concat_in_handle_data(self):
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
fragment='', nofollow=False)])

def test_restrict_css(self):
    """Links are extracted only from regions matched by restrict_css."""
    extractor = self.extractor_cls(restrict_css=('#subwrapper a',))
    expected = [Link(url='http://example.com/sample2.html', text=u'sample 2')]
    self.assertEqual(extractor.extract_links(self.response), expected)

def test_restrict_css_and_restrict_xpaths_together(self):
    """restrict_xpaths and restrict_css regions are combined, not exclusive."""
    extractor = self.extractor_cls(
        restrict_xpaths=('//div[@id="subwrapper"]', ),
        restrict_css=('#subwrapper + a', ),
    )
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)

def test_area_tag_with_unicode_present(self):
body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
Expand Down