[LinkExtractors] Ignore bogus links
(rebased the code for scrapy 1.0 and made a few code improvements --nyov)
Scorpil authored and redapple committed Jan 11, 2016
1 parent b262411 commit 108195e780a5ac183a1afcf6893b6baeaf0b2dac
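The change touches the SGML-, lxml-, regex- and HTMLParser-based link extractors shown below, plus their tests: wherever an extracted href is joined against the page's base URL, the urljoin() call is now wrapped in try/except ValueError so that a malformed link is skipped instead of aborting extraction of the whole response. The following sketch is not part of the commit (only standard-library names are used); it reproduces the failure mode being guarded against: Python's urljoin()/urlsplit() raise ValueError for a host that opens an IPv6 bracket without closing it.

    # Standalone illustration of the ValueError this commit suppresses.
    try:
        from urlparse import urljoin        # Python 2, as used by Scrapy 1.0
    except ImportError:
        from urllib.parse import urljoin    # Python 3

    base = "http://example.org/index.html"
    hrefs = [
        "http://example.org/item1.html",
        "http://[example.org/item2.html",   # bogus: "[" in the netloc, no closing "]"
        "http://example.org/item3.html",
    ]
    for href in hrefs:
        try:
            print(urljoin(base, href))
        except ValueError as exc:           # e.g. "Invalid IPv6 URL"
            print("skipping bogus link %r (%s)" % (href, exc))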
@@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
         for link in links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = link.text.decode(response_encoding)
             ret.append(link)
@@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
         # hacky way to get the underlying lxml parsed document
         for el, attr, attr_val in self._iter_links(selector._root):
             # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
-            attr_val = urljoin(base_url, attr_val)
-            url = self.process_attr(attr_val)
-            if url is None:
-                continue
+            try:
+                attr_val = urljoin(base_url, attr_val)
+            except ValueError:
+                continue  # skipping bogus links
+            else:
+                url = self.process_attr(attr_val)
+                if url is None:
+                    continue
             if isinstance(url, unicode):
                 url = url.encode(response_encoding)
             # to fix relative links after process_value
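In the lxml extractor the follow-up work is moved into an else clause: process_attr() only runs when urljoin() succeeded, so exceptions from the later processing are not swallowed by the new handler. A minimal, commit-independent sketch of that control flow (join_or_skip is a hypothetical name used only here):

    try:                                      # stdlib import, Python 2 or 3
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    def join_or_skip(base, href):
        # try/except/else: the else branch runs only when urljoin() did not raise,
        # so errors from the follow-up step are not accidentally suppressed.
        try:
            joined = urljoin(base, href)
        except ValueError:
            return None                       # bogus link, caller skips it
        else:
            return joined                     # stand-in for process_attr() etc.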
@@ -14,16 +14,25 @@ def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
return link_text.strip("\t\r\n '\"")


class RegexLinkExtractor(SgmlLinkExtractor):
"""High performant link extractor"""

def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
def clean_text(text):
return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

def clean_url(url):
clean_url = ''
try:
clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
except ValueError:
pass
return clean_url

if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

links_text = linkre.findall(response_text)
return [Link(clean_url(url).encode(response_encoding),
clean_text(text))
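In the regex extractor the one-line helpers could not simply be wrapped in place: a lambda body is a single expression and cannot contain a try/except statement, so clean_url and clean_text become nested functions, with clean_url handing back an empty string when urljoin() rejects the link. An isolated, simplified stand-in for that helper (the default base_url here is only for illustration, not part of the commit):

    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    def clean_url(url, base_url="http://example.org/"):
        # Swallow ValueError from urljoin() and return an empty string
        # instead of letting the exception propagate to the caller.
        try:
            return urljoin(base_url, url)
        except ValueError:
            return ''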
@@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
         for link in self.links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = str_to_unicode(link.text, response_encoding, errors='replace').strip()
             ret.append(link)
@@ -491,10 +491,36 @@ def test_xhtml(self):
             Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
         )

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class HtmlParserLinkExtractorTestCase(unittest.TestCase):

@@ -512,6 +538,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = HtmlParserLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class RegexLinkExtractorTestCase(unittest.TestCase):

@@ -528,6 +567,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = RegexLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 if __name__ == "__main__":
     unittest.main()
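As a quick end-to-end check of the behaviour the new tests pin down, the default (lxml-based) extractor can be exercised directly. This snippet is not part of the commit; it assumes a Scrapy installation that already contains this fix and uses the Scrapy 1.0 import paths.

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor   # lxml-based default extractor

    html = b"""
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    for link in LinkExtractor().extract_links(response):
        print(link.url)   # item1 and item3 only; the bogus item2 link is ignored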
