Skip to content

Commit

Permalink
Merge pull request #1767 from orangain/sitemap-robotstxt
Browse files Browse the repository at this point in the history
[MRG+1] PY3: Fix SitemapSpider to extract sitemap urls from robots.txt properly
  • Loading branch information
kmike committed Feb 8, 2016
2 parents e328a9b + 25c5615 commit 44bc4c0
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
2 changes: 1 addition & 1 deletion scrapy/spiders/sitemap.py
Expand Up @@ -32,7 +32,7 @@ def start_requests(self):

def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
for url in sitemap_urls_from_robots(response.text):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_spider.py
Expand Up @@ -328,6 +328,18 @@ def test_get_sitemap_body_xml_url_compressed(self):
r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
self.assertSitemapBody(r, self.BODY)

def test_get_sitemap_urls_from_robotstxt(self):
    """A robots.txt response fed to _parse_sitemap yields one Request
    per ``Sitemap:`` line, in file order."""
    robots = b"""# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
"""

    response = TextResponse(url="http://www.example.com/robots.txt", body=robots)
    spider = self.spider_class("example.com")
    # Collect the URL of every Request the spider generates for comparison.
    extracted = [request.url for request in spider._parse_sitemap(response)]
    expected = [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap-product-index.xml',
    ]
    self.assertEqual(extracted, expected)


class BaseSpiderDeprecationTest(unittest.TestCase):

Expand Down

0 comments on commit 44bc4c0

Please sign in to comment.