Skip to content

Commit

Permalink
Fix HtmlParserLinkExtractor and tests after #485 merge
Browse files (browse the repository at this point in the history)
  • Loading branch information
redapple authored and dangra committed Feb 5, 2014
1 parent b566388 commit 368a946
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
7 changes: 4 additions & 3 deletions scrapy/contrib/linkextractors/htmlparser.py
Expand Up @@ -62,11 +62,12 @@ def handle_starttag(self, tag, attrs):
self.current_link = link

 def handle_endtag(self, tag):
-    self.current_link = None
+    if self.scan_tag(tag):
+        self.current_link = None

 def handle_data(self, data):
-    if self.current_link and not self.current_link.text:
-        self.current_link.text = data.strip()
+    if self.current_link:
+        self.current_link.text = self.current_link.text + data

def matches(self, url):
"""This extractor matches with any url, since
Expand Down
6 changes: 4 additions & 2 deletions scrapy/tests/test_contrib_linkextractors.py
Expand Up @@ -317,7 +317,8 @@ def test_extraction(self):
 [Link(url='http://example.com/sample2.html', text=u'sample 2'),
  Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
  Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
- Link(url='http://www.google.com/something', text=u''),])
+ Link(url='http://www.google.com/something', text=u''),
+ Link(url='http://example.com/innertag.html', text=u'inner tag'),])


class RegexLinkExtractorTestCase(unittest.TestCase):
Expand All @@ -332,7 +333,8 @@ def test_extraction(self):
 self.assertEqual(lx.extract_links(self.response),
     [Link(url='http://example.com/sample2.html', text=u'sample 2'),
      Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
-     Link(url='http://www.google.com/something', text=u''),])
+     Link(url='http://www.google.com/something', text=u''),
+     Link(url='http://example.com/innertag.html', text=u'inner tag'),])


if __name__ == "__main__":
Expand Down

0 comments on commit 368a946

Please sign in to comment.