Skip to content
Permalink
Browse files

Fix HtmlParserLinkExtractor and tests after #485 merge

  • Loading branch information
redapple authored and dangra committed Feb 5, 2014
1 parent b566388 commit 368a9467aa7502c5d5febbfa8c85e7da061d1879
Showing with 8 additions and 5 deletions.
  1. +4 −3 scrapy/contrib/linkextractors/htmlparser.py
  2. +4 −2 scrapy/tests/test_contrib_linkextractors.py
@@ -62,11 +62,12 @@ def handle_starttag(self, tag, attrs):
self.current_link = link

     def handle_endtag(self, tag):
-        self.current_link = None
+        if self.scan_tag(tag):
+            self.current_link = None
 
     def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
+        if self.current_link:
+            self.current_link.text = self.current_link.text + data

def matches(self, url):
"""This extractor matches with any url, since
@@ -317,7 +317,8 @@ def test_extraction(self):
             [Link(url='http://example.com/sample2.html', text=u'sample 2'),
              Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
              Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
-             Link(url='http://www.google.com/something', text=u''),])
+             Link(url='http://www.google.com/something', text=u''),
+             Link(url='http://example.com/innertag.html', text=u'inner tag'),])


class RegexLinkExtractorTestCase(unittest.TestCase):
@@ -332,7 +333,8 @@ def test_extraction(self):
         self.assertEqual(lx.extract_links(self.response),
             [Link(url='http://example.com/sample2.html', text=u'sample 2'),
              Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
-             Link(url='http://www.google.com/something', text=u''),])
+             Link(url='http://www.google.com/something', text=u''),
+             Link(url='http://example.com/innertag.html', text=u'inner tag'),])


if __name__ == "__main__":

0 comments on commit 368a946

Please sign in to comment.
You can’t perform that action at this time.