Commit

Merge branch 'master' into py3-linkextractors
Conflicts:
	scrapy/linkextractors/lxmlhtml.py
	tests/test_linkextractors.py
kmike committed Aug 27, 2015
2 parents ff24cbb + aa31811 commit 9bfe6ec
Showing 5 changed files with 22 additions and 6 deletions.
4 changes: 2 additions & 2 deletions scrapy/linkextractors/lxmlhtml.py
@@ -7,7 +7,7 @@
import lxml.etree as etree

from scrapy.link import Link
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_native_str
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.response import get_base_url
@@ -60,7 +60,7 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
# to fix relative links after process_value
url = urljoin(response_url, url)
link = Link(url, _collect_string_content(el) or u'',
nofollow=True if el.get('rel') == 'nofollow' else False)
nofollow=rel_has_nofollow(el.get('rel')))
links.append(link)
return self._deduplicate_if_needed(links)

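Note (not part of the commit): in HTML, rel is a space-separated list of link types, so the old strict comparison el.get('rel') == 'nofollow' only matched when nofollow was the sole value and missed links such as <a rel="external nofollow">. A rough sketch of the difference, using an illustrative rel value:

    rel = "external nofollow"
    old_nofollow = (rel == "nofollow")                             # False: multi-valued rel missed
    new_nofollow = rel is not None and "nofollow" in rel.split()   # True: the check rel_has_nofollow performs
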
4 changes: 2 additions & 2 deletions scrapy/linkextractors/sgml.py
@@ -9,7 +9,7 @@
from scrapy.selector import Selector
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_unicode
from scrapy.utils.response import get_base_url
from scrapy.exceptions import ScrapyDeprecationWarning
@@ -80,7 +80,7 @@ def unknown_starttag(self, tag, attrs):
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
self.links.append(link)
self.current_link = link

5 changes: 5 additions & 0 deletions scrapy/utils/misc.py
@@ -112,3 +112,8 @@ def md5sum(file):
break
m.update(d)
return m.hexdigest()

def rel_has_nofollow(rel):
"""Return True if link rel attribute has nofollow type"""
return True if rel is not None and 'nofollow' in rel.split() else False

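A few illustrative calls to the new helper (expected results inferred from its definition above, not output recorded in the commit):

    from scrapy.utils.misc import rel_has_nofollow

    rel_has_nofollow("nofollow")           # True
    rel_has_nofollow("external nofollow")  # True  (space-separated rel values are split)
    rel_has_nofollow("blah")               # False
    rel_has_nofollow(None)                 # False (missing rel attribute)
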
13 changes: 11 additions & 2 deletions tests/test_linkextractors.py
@@ -112,6 +112,9 @@ def test_nofollow(self):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html)

@@ -121,6 +124,7 @@ def test_nofollow(self):
Link(url='http://example.org/follow.html', text=u'Follow this link'),
Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
])

def test_matches(self):
@@ -369,6 +373,9 @@ def test_xhtml(self):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body>
</html>
"""
@@ -380,7 +387,8 @@ def test_xhtml(self):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)

response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
@@ -390,7 +398,8 @@ def test_xhtml(self):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)

def test_link_wrong_href(self):
2 changes: 2 additions & 0 deletions tests/test_linkextractors_deprecated.py
@@ -149,12 +149,14 @@ def test_link_nofollow(self):
html = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
<a href="http://google.com/something" rel="external nofollow">Something</a>
"""
response = HtmlResponse("http://example.org/page.html", body=html)
lx = SgmlLinkExtractor()
self.assertEqual([link for link in lx.extract_links(response)], [
Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
Link(url='http://google.com/something', text=u'Something', nofollow=True),
])

