diff --git a/scrapy/linkextractors/htmlparser.py b/scrapy/linkextractors/htmlparser.py
index 202340f538f..dcc261b319f 100644
--- a/scrapy/linkextractors/htmlparser.py
+++ b/scrapy/linkextractors/htmlparser.py
@@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
         for link in links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = link.text.decode(response_encoding)
             ret.append(link)
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 606a45212a1..e9fa521f392 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
         # hacky way to get the underlying lxml parsed document
         for el, attr, attr_val in self._iter_links(selector.root):
             # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
-            attr_val = urljoin(base_url, attr_val)
-            url = self.process_attr(attr_val)
-            if url is None:
-                continue
+            try:
+                attr_val = urljoin(base_url, attr_val)
+            except ValueError:
+                continue  # skipping bogus links
+            else:
+                url = self.process_attr(attr_val)
+                if url is None:
+                    continue
             if isinstance(url, unicode):
                 url = url.encode(response_encoding)
             # to fix relative links after process_value
diff --git a/scrapy/linkextractors/regex.py b/scrapy/linkextractors/regex.py
index 905eb89692a..b6f8d5d30dc 100644
--- a/scrapy/linkextractors/regex.py
+++ b/scrapy/linkextractors/regex.py
@@ -14,16 +14,25 @@ def clean_link(link_text):
     """Remove leading and trailing whitespace and punctuation"""
     return link_text.strip("\t\r\n '\"")
 
+
 class RegexLinkExtractor(SgmlLinkExtractor):
     """High performant link extractor"""
 
     def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
+        def clean_text(text):
+            return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()
+
+        def clean_url(url):
+            clean_url = ''
+            try:
+                clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
+            except ValueError:
+                pass
+            return clean_url
+
         if base_url is None:
             base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
-        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
-        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
-
         links_text = linkre.findall(response_text)
         return [Link(clean_url(url).encode(response_encoding), clean_text(text))
diff --git a/scrapy/linkextractors/sgml.py b/scrapy/linkextractors/sgml.py
index 4a6a24254ac..acecd9c0d59 100644
--- a/scrapy/linkextractors/sgml.py
+++ b/scrapy/linkextractors/sgml.py
@@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
         for link in self.links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
             ret.append(link)
diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
index 948289f8f89..d78b25f2580 100644
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@@ -491,10 +491,36 @@ def test_xhtml(self):
             Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
         )
 
+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+
 
 class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor
 
+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+
 
 class HtmlParserLinkExtractorTestCase(unittest.TestCase):
@@ -512,6 +538,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])
 
+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = HtmlParserLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+
 
 class RegexLinkExtractorTestCase(unittest.TestCase):
@@ -528,6 +567,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])
 
+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = RegexLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+
 
 if __name__ == "__main__":
     unittest.main()
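For reviewers, a quick note on why the `try/except ValueError` is needed: `urljoin()` delegates parsing to `urlsplit()`, which raises `ValueError` for URLs it cannot parse at all, such as a netloc containing an unmatched `[` (interpreted by the stdlib as a malformed IPv6 host). That is exactly what the bogus `http://[example.org/item2.html` link in the new tests triggers. A minimal sketch (not part of the patch) reproducing the failure mode:

```python
# Minimal repro of the ValueError the extractors now swallow.
# The import matches the Python 2 code in this patch; the
# Python 3 location is noted for reference.
try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

base = "http://example.org/index.html"

# A well-formed href joins normally:
print(urljoin(base, "item1.html"))
# -> http://example.org/item1.html

# An unmatched "[" makes urlsplit() treat the netloc as a broken
# IPv6 host, so urljoin() raises instead of returning a URL:
try:
    urljoin(base, "http://[example.org/item2.html")
except ValueError as e:
    print("skipped bogus link: %s" % e)
# -> skipped bogus link: Invalid IPv6 URL
```

Each extractor catches the error per link and `continue`s (in `RegexLinkExtractor`, `clean_url` falls back to an empty string instead), so a single malformed href no longer aborts link extraction for the entire page.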