Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed default value of attrs argument in SgmlLinkExtractor to be tuple #661

Merged
merged 1 commit into from
Mar 27, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docs/topics/link-extractors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@ SgmlLinkExtractor
domains which won't be considered for extracting the links
:type deny_domains: str or list

:param deny_extensions: a list of extensions that should be ignored when
extracting links. If not given, it will default to the
:param deny_extensions: a single value or list of strings containing
extensions that should be ignored when extracting links.
If not given, it will default to the
``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
module.
:type deny_extensions: list
Expand All @@ -85,7 +86,7 @@ SgmlLinkExtractor
Defaults to ``('a', 'area')``.
:type tags: str or list

:param attrs: list of attributes which should be considered when looking
:param attrs: an attribute or list of attributes which should be considered when looking
for links to extract (only for those tags specified in the ``tags``
parameter). Defaults to ``('href',)``
:type attrs: list
Expand Down
8 changes: 4 additions & 4 deletions scrapy/contrib/linkextractors/sgml.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def matches(self, url):
class SgmlLinkExtractor(BaseSgmlLinkExtractor):

def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None,
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
deny_extensions=None):
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
Expand All @@ -105,9 +105,9 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
self.canonicalize = canonicalize
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.deny_extensions = set(['.' + e for e in deny_extensions])
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
tag_func = lambda x: x in arg_to_iter(tags)
attr_func = lambda x: x in arg_to_iter(attrs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect that if you remove arg_to_iter and make attrs=('href') again then new tests will still pass because 'href' in 'href' is True. We may e.g. add an element with "ref" attribute to the testing suite to check that condition, but I'm fine with merging without this fix.

BaseSgmlLinkExtractor.__init__(self,
tag=tag_func,
attr=attr_func,
Expand Down
66 changes: 66 additions & 0 deletions scrapy/tests/test_contrib_linkextractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,11 @@ def test_deny_extensions(self):
Link(url='http://example.org/page.html', text=u'asd'),
])

lx = SgmlLinkExtractor(deny_extensions="jpg")
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.org/page.html', text=u'asd'),
])

def test_process_value(self):
"""Test restrict_xpaths with encodings"""
html = """
Expand Down Expand Up @@ -304,6 +309,67 @@ def test_base_url_with_restrict_xpaths(self):
[Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])


def test_attrs(self):
lx = SgmlLinkExtractor(attrs="href")
self.assertEqual(lx.extract_links(self.response), [
Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),
])

lx = SgmlLinkExtractor(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
self.assertEqual(lx.extract_links(self.response), [
Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample2.jpg', text=u''),
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),
])

lx = SgmlLinkExtractor(attrs=None)
self.assertEqual(lx.extract_links(self.response), [])

html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
response = HtmlResponse("http://example.com/index.html", body=html)
lx = SgmlLinkExtractor(attrs=("href"))
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample1.html', text=u''),
])


def test_tags(self):
html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
response = HtmlResponse("http://example.com/index.html", body=html)

lx = SgmlLinkExtractor(tags=None)
self.assertEqual(lx.extract_links(response), [])

lx = SgmlLinkExtractor()
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
])

lx = SgmlLinkExtractor(tags="area")
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample1.html', text=u''),
])

lx = SgmlLinkExtractor(tags="a")
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample2.html', text=u'sample 2'),
])

lx = SgmlLinkExtractor(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample2.jpg', text=u''),
])


class HtmlParserLinkExtractorTestCase(unittest.TestCase):

def setUp(self):
Expand Down