Fix HtmlParserLinkExtractor and tests after #485 merge

scrapy · Feb 5, 2014 · 368a946
1 parent b566388
commit 368a946
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/scrapy/contrib/linkextractors/htmlparser.py b/scrapy/contrib/linkextractors/htmlparser.py
@@ -62,11 +62,12 @@ def handle_starttag(self, tag, attrs):
                     self.current_link = link
 
     def handle_endtag(self, tag):
-        self.current_link = None
+        if self.scan_tag(tag):
+            self.current_link = None
 
     def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
+        if self.current_link:
+            self.current_link.text = self.current_link.text + data
 
     def matches(self, url):
         """This extractor matches with any url, since

diff --git a/scrapy/tests/test_contrib_linkextractors.py b/scrapy/tests/test_contrib_linkextractors.py
@@ -317,7 +317,8 @@ def test_extraction(self):
                          [Link(url='http://example.com/sample2.html', text=u'sample 2'),
                           Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                           Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
-                          Link(url='http://www.google.com/something', text=u''),])
+                          Link(url='http://www.google.com/something', text=u''),
+                          Link(url='http://example.com/innertag.html', text=u'inner tag'),])
 
 
 class RegexLinkExtractorTestCase(unittest.TestCase):
@@ -332,7 +333,8 @@ def test_extraction(self):
         self.assertEqual(lx.extract_links(self.response),
                          [Link(url='http://example.com/sample2.html', text=u'sample 2'),
                           Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
-                          Link(url='http://www.google.com/something', text=u''),])
+                          Link(url='http://www.google.com/something', text=u''),
+                          Link(url='http://example.com/innertag.html', text=u'inner tag'),])
 
 
 if __name__ == "__main__":