Skip to content

Commit

Permalink
Merge pull request #1767 from orangain/sitemap-robotstxt
Browse files Browse the repository at this point in the history
[MRG+1] PY3: Fix SitemapSpider to extract sitemap urls from robots.txt properly
  • Loading branch information
kmike committed Feb 8, 2016
2 parents e328a9b + 25c5615 commit 44bc4c0
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
2 changes: 1 addition & 1 deletion scrapy/spiders/sitemap.py
Expand Up @@ -32,7 +32,7 @@ def start_requests(self):

def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
for url in sitemap_urls_from_robots(response.text):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_spider.py
Expand Up @@ -328,6 +328,18 @@ def test_get_sitemap_body_xml_url_compressed(self):
r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
self.assertSitemapBody(r, self.BODY)

def test_get_sitemap_urls_from_robotstxt(self):
    """A robots.txt response fed to _parse_sitemap yields one Request
    per ``Sitemap:`` line, in file order."""
    robots = b"""# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
"""

    response = TextResponse(url="http://www.example.com/robots.txt", body=robots)
    spider = self.spider_class("example.com")
    # Collect the URL of every Request the spider generates for comparison.
    extracted = [request.url for request in spider._parse_sitemap(response)]
    expected = [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap-product-index.xml',
    ]
    self.assertEqual(extracted, expected)


class BaseSpiderDeprecationTest(unittest.TestCase):

Expand Down

0 comments on commit 44bc4c0

Please sign in to comment.