diff --git a/scrapy/spiders/sitemap.py b/scrapy/spiders/sitemap.py
index eede467a83f..89d96c3302a 100644
--- a/scrapy/spiders/sitemap.py
+++ b/scrapy/spiders/sitemap.py
@@ -32,7 +32,7 @@ def start_requests(self):
 
     def _parse_sitemap(self, response):
         if response.url.endswith('/robots.txt'):
-            for url in sitemap_urls_from_robots(response.body):
+            for url in sitemap_urls_from_robots(response.text):
                 yield Request(url, callback=self._parse_sitemap)
         else:
             body = self._get_sitemap_body(response)
diff --git a/tests/test_spider.py b/tests/test_spider.py
index 4d5d4b07e49..1d22c1212df 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -328,6 +328,18 @@ def test_get_sitemap_body_xml_url_compressed(self):
         r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
         self.assertSitemapBody(r, self.BODY)
 
+    def test_get_sitemap_urls_from_robotstxt(self):
+        robots = b"""# Sitemap files
+Sitemap: http://example.com/sitemap.xml
+Sitemap: http://example.com/sitemap-product-index.xml
+"""
+
+        r = TextResponse(url="http://www.example.com/robots.txt", body=robots)
+        spider = self.spider_class("example.com")
+        self.assertEqual([req.url for req in spider._parse_sitemap(r)],
+                         ['http://example.com/sitemap.xml',
+                          'http://example.com/sitemap-product-index.xml'])
+
 
 class BaseSpiderDeprecationTest(unittest.TestCase):
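
For context on the one-line fix: in Scrapy, `response.body` is the raw bytes of the response, while `response.text` is that body decoded using the response's encoding. `sitemap_urls_from_robots` does line-oriented text parsing, so handing it bytes breaks under Python 3, where `str` and `bytes` no longer mix. A minimal sketch of the failure mode — `extract_sitemap_urls` below is a hypothetical stand-in for illustration, not the actual `scrapy.utils.sitemap` implementation:

```python
# Simplified stand-in for a text-oriented robots.txt sitemap extractor
# (hypothetical; for illustrating the bytes-vs-text issue only).
def extract_sitemap_urls(robots_text):
    # Compares each line against the str prefix "sitemap:", which
    # assumes decoded (unicode) input.
    for line in robots_text.splitlines():
        if line.lstrip().lower().startswith('sitemap:'):
            yield line.split(':', 1)[1].strip()

robots_bytes = b"Sitemap: http://example.com/sitemap.xml\n"

# Bytes input (what response.body provides): splitlines() yields bytes,
# and on Python 3 bytes.startswith() rejects a str prefix with TypeError.
try:
    list(extract_sitemap_urls(robots_bytes))
except TypeError as e:
    print("bytes input fails:", e)

# Decoded input (what response.text provides): parses as intended.
print(list(extract_sitemap_urls(robots_bytes.decode('utf-8'))))
# ['http://example.com/sitemap.xml']
```

The added test exercises exactly this path: it builds a `TextResponse` for a robots.txt URL and asserts that `_parse_sitemap` yields one `Request` per `Sitemap:` line.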