[LinkExtractors] Ignore bogus links
(rebased the code for scrapy 1.0 and made a few code improvements --nyov)
Scorpil authored and redapple committed Jan 11, 2016
1 parent b262411 commit 108195e780a5ac183a1afcf6893b6baeaf0b2dac
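The change touches the SGML-, lxml-, regex- and HTMLParser-based link extractors shown below, plus their tests: wherever an extracted href is joined against the page's base URL, the urljoin() call is now wrapped in try/except ValueError so that a malformed link is skipped instead of aborting extraction of the whole response. The following sketch is not part of the commit (only standard-library names are used); it reproduces the failure mode being guarded against: Python's urljoin()/urlsplit() raise ValueError for a host that opens an IPv6 bracket without closing it.

    # Standalone illustration of the ValueError this commit suppresses.
    try:
        from urlparse import urljoin        # Python 2, as used by Scrapy 1.0
    except ImportError:
        from urllib.parse import urljoin    # Python 3

    base = "http://example.org/index.html"
    hrefs = [
        "http://example.org/item1.html",
        "http://[example.org/item2.html",   # bogus: "[" in the netloc, no closing "]"
        "http://example.org/item3.html",
    ]
    for href in hrefs:
        try:
            print(urljoin(base, href))
        except ValueError as exc:           # e.g. "Invalid IPv6 URL"
            print("skipping bogus link %r (%s)" % (href, exc))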
@@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
         for link in links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = link.text.decode(response_encoding)
             ret.append(link)
@@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
         # hacky way to get the underlying lxml parsed document
         for el, attr, attr_val in self._iter_links(selector._root):
             # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
-            attr_val = urljoin(base_url, attr_val)
-            url = self.process_attr(attr_val)
-            if url is None:
-                continue
+            try:
+                attr_val = urljoin(base_url, attr_val)
+            except ValueError:
+                continue  # skipping bogus links
+            else:
+                url = self.process_attr(attr_val)
+                if url is None:
+                    continue
             if isinstance(url, unicode):
                 url = url.encode(response_encoding)
             # to fix relative links after process_value
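In the lxml extractor the follow-up work is moved into an else clause: process_attr() only runs when urljoin() succeeded, so exceptions from the later processing are not swallowed by the new handler. A minimal, commit-independent sketch of that control flow (join_or_skip is a hypothetical name used only here):

    try:                                      # stdlib import, Python 2 or 3
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    def join_or_skip(base, href):
        # try/except/else: the else branch runs only when urljoin() did not raise,
        # so errors from the follow-up step are not accidentally suppressed.
        try:
            joined = urljoin(base, href)
        except ValueError:
            return None                       # bogus link, caller skips it
        else:
            return joined                     # stand-in for process_attr() etc.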
@@ -14,16 +14,25 @@ def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
return link_text.strip("\t\r\n '\"")


class RegexLinkExtractor(SgmlLinkExtractor):
"""High performant link extractor"""

def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
def clean_text(text):
return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

def clean_url(url):
clean_url = ''
try:
clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
except ValueError:
pass
return clean_url

if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

links_text = linkre.findall(response_text)
return [Link(clean_url(url).encode(response_encoding),
clean_text(text))
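In the regex extractor the one-line helpers could not simply be wrapped in place: a lambda body is a single expression and cannot contain a try/except statement, so clean_url and clean_text become nested functions, with clean_url handing back an empty string when urljoin() rejects the link. An isolated, simplified stand-in for that helper (the default base_url here is only for illustration, not part of the commit):

    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    def clean_url(url, base_url="http://example.org/"):
        # Swallow ValueError from urljoin() and return an empty string
        # instead of letting the exception propagate to the caller.
        try:
            return urljoin(base_url, url)
        except ValueError:
            return ''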
@@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
         for link in self.links:
             if isinstance(link.url, unicode):
                 link.url = link.url.encode(response_encoding)
-            link.url = urljoin(base_url, link.url)
+            try:
+                link.url = urljoin(base_url, link.url)
+            except ValueError:
+                continue
             link.url = safe_url_string(link.url, response_encoding)
             link.text = str_to_unicode(link.text, response_encoding, errors='replace').strip()
             ret.append(link)
@@ -491,10 +491,36 @@ def test_xhtml(self):
             Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
         )

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = self.extractor_cls()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class HtmlParserLinkExtractorTestCase(unittest.TestCase):

@@ -512,6 +538,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = HtmlParserLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 class RegexLinkExtractorTestCase(unittest.TestCase):

@@ -528,6 +567,19 @@ def test_extraction(self):
             Link(url='http://www.google.com/something', text=u''),
             Link(url='http://example.com/innertag.html', text=u'inner tag'),])

+    def test_link_wrong_href(self):
+        html = """
+        <a href="http://example.org/item1.html">Item 1</a>
+        <a href="http://[example.org/item2.html">Item 2</a>
+        <a href="http://example.org/item3.html">Item 3</a>
+        """
+        response = HtmlResponse("http://example.org/index.html", body=html)
+        lx = RegexLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)], [
+            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+        ])
+

 if __name__ == "__main__":
     unittest.main()
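As a quick end-to-end check of the behaviour the new tests pin down, the default (lxml-based) extractor can be exercised directly. This snippet is not part of the commit; it assumes a Scrapy installation that already contains this fix and uses the Scrapy 1.0 import paths.

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor   # lxml-based default extractor

    html = b"""
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    for link in LinkExtractor().extract_links(response):
        print(link.url)   # item1 and item3 only; the bogus item2 link is ignored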
