From 8a717668ddc201f37cd1def3db85e0f67b602850 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 14:33:14 -0700 Subject: [PATCH 01/12] Replicate original patch by pwirtz, bumping version number --- Doc/library/urllib.robotparser.rst | 11 +++++++++++ Lib/urllib/robotparser.py | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index e3b90e673caaf0c..8fc2079991393ac 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -76,6 +76,17 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. .. versionadded:: 3.6 + .. method:: site_maps() + + Returns the contents of the ``Sitemap`` parameter from + ``robots.txt`` in the form of a :func:`list`. If there is no such + parameter or the ``robots.txt`` entry for this parameter has + invalid syntax, return ``None``. + + .. versionadded:: 3.8 + + + The following example demonstrates basic use of the :class:`RobotFileParser` class:: diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 92e4efe6865e1f8..7089916a4f81cc0 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -27,6 +27,7 @@ class RobotFileParser: def __init__(self, url=''): self.entries = [] + self.sitemaps = [] self.default_entry = None self.disallow_all = False self.allow_all = False @@ -141,6 +142,12 @@ def parse(self, lines): and numbers[1].strip().isdigit()): entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1])) state = 2 + elif line[0] == "sitemap": + # According to http://www.sitemaps.org/protocol.html + # "This directive is independent of the user-agent line, + # so it doesn't matter where you place it in your file." + # Therefore we do not change the state of the parser. 
+ self.sitemaps.append(line[1]) if state == 2: self._add_entry(entry) @@ -189,6 +196,11 @@ def request_rate(self, useragent): return entry.req_rate return self.default_entry.req_rate + def site_maps(self): + if not self.sitemaps: + return None + return self.sitemaps + def __str__(self): entries = self.entries if self.default_entry is not None: From 834367e0d5515f132f06ac5abe11d5d451c1009f Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 14:45:50 -0700 Subject: [PATCH 02/12] Add unit test to test that robot parser correctly parses sitemaps --- Lib/test/test_robotparser.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index bee8d238be6b2fa..fc0cc810b325f1b 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -12,6 +12,7 @@ class BaseRobotTest: agent = 'test_robotparser' good = [] bad = [] + site_maps = None def setUp(self): lines = io.StringIO(self.robots_txt).readlines() @@ -36,6 +37,9 @@ def test_bad_urls(self): with self.subTest(url=url, agent=agent): self.assertFalse(self.parser.can_fetch(agent, url)) + def test_site_maps(self): + self.assertEqual(self.parser.site_maps(), self.site_maps) + class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ @@ -65,6 +69,23 @@ class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase): bad = ['/cyberworld/map/index.html'] +class SitemapTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +# robots.txt for http://www.example.com/ + +User-agent: * +Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml +Sitemap: http://www.google.com/hostednews/sitemap_index.xml +Request-rate: 3/15 +Disallow: /cyberworld/map/ # This is an infinite virtual URL space + + """ + good = ['/', '/test.html'] + bad = ['/cyberworld/map/index.html'] + site_maps = ["http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml", + 
"http://www.google.com/hostednews/sitemap_index.xml"] + + class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ # go away @@ -292,7 +313,7 @@ def setUp(self): # Short poll interval to make the test finish quickly. # Time between requests is short enough that we won't wake # up spuriously too many times. - kwargs={'poll_interval':0.01}) + kwargs={'poll_interval': 0.01}) self.t.daemon = True # In case this function raises. self.t.start() @@ -353,5 +374,6 @@ def test_read_404(self): self.assertIsNone(parser.crawl_delay('*')) self.assertIsNone(parser.request_rate('*')) -if __name__=='__main__': + +if __name__ == '__main__': unittest.main() From 72b53af28143213eeeffc8f5e30ec9b46148b2d0 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 15:04:10 -0700 Subject: [PATCH 03/12] add news --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst new file mode 100644 index 000000000000000..705b41548f1db8b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -0,0 +1,2 @@ +Added support for optional Site Map extension to urllib robotparser. 
Patch +by Lady Red From 4be2015c46998d2db8bd0ebde6ea43dbcd7eab6d Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 21:11:41 -0700 Subject: [PATCH 04/12] CR from Mariatta --- Doc/library/urllib.robotparser.rst | 2 -- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index 8fc2079991393ac..544f50273dd17c4 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -86,8 +86,6 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. .. versionadded:: 3.8 - - The following example demonstrates basic use of the :class:`RobotFileParser` class:: diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 705b41548f1db8b..bd8b02b77447a99 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1,2 +1,2 @@ Added support for optional Site Map extension to urllib robotparser. Patch -by Lady Red +by Lady Red, based on patch by Peter Wirtz. From 44114b3f5ae161e79c97ad250ba344e581c953b8 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 21:14:29 -0700 Subject: [PATCH 05/12] Add to Ack --- Misc/ACKS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Misc/ACKS b/Misc/ACKS index 665b4dd7f43fb53..5c05ee7d5aa19f1 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -109,6 +109,7 @@ Anthony Baxter Mike Bayer Samuel L. 
Bayer Bo Bayles +Christopher Beacham AKA Lady Red Tommy Beadle Donald Beaudry David Beazley @@ -1760,6 +1761,7 @@ Dik Winter Blake Winton Jean-Claude Wippler Stéphane Wirtel +Peter Wirtz Lars Wirzenius John Wiseman Chris Withers From ee76a1171b4f8e763f7f18082fe9acf9303c6901 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 21:16:54 -0700 Subject: [PATCH 06/12] Spacing --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index bd8b02b77447a99..965f5a9e02b3af8 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1,2 +1 @@ -Added support for optional Site Map extension to urllib robotparser. Patch -by Lady Red, based on patch by Peter Wirtz. +Added support for optional Site Map extension to urllib robotparser. Patch by Lady Red, based on patch by Peter Wirtz. From 7129e9b86e4b0c63413d225dae31deccdf3efb7f Mon Sep 17 00:00:00 2001 From: Mariatta Date: Wed, 16 May 2018 00:23:24 -0400 Subject: [PATCH 07/12] Update News entry file, limit to 80 chars. --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 965f5a9e02b3af8..3c5f479676ddbba 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1 +1,2 @@ -Added support for optional Site Map extension to urllib robotparser. Patch by Lady Red, based on patch by Peter Wirtz. +Added support for optional Site Map extension to urllib robotparser. Patch by +Lady Red, based on patch by Peter Wirtz. 
From 35442c5958751c674b1c83e831f719e13e13389b Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 22:16:57 -0700 Subject: [PATCH 08/12] code review by berkerpeksag --- Lib/test/test_robotparser.py | 7 +++---- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index fc0cc810b325f1b..3c23676467608ae 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -82,8 +82,8 @@ class SitemapTest(BaseRobotTest, unittest.TestCase): """ good = ['/', '/test.html'] bad = ['/cyberworld/map/index.html'] - site_maps = ["http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml", - "http://www.google.com/hostednews/sitemap_index.xml"] + site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml', + 'http://www.google.com/hostednews/sitemap_index.xml'] class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): @@ -313,7 +313,7 @@ def setUp(self): # Short poll interval to make the test finish quickly. # Time between requests is short enough that we won't wake # up spuriously too many times. - kwargs={'poll_interval': 0.01}) + kwargs={'poll_interval':0.01}) self.t.daemon = True # In case this function raises. self.t.start() @@ -374,6 +374,5 @@ def test_read_404(self): self.assertIsNone(parser.crawl_delay('*')) self.assertIsNone(parser.request_rate('*')) - if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 965f5a9e02b3af8..074aa16dd9ead26 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1 +1,2 @@ -Added support for optional Site Map extension to urllib robotparser. Patch by Lady Red, based on patch by Peter Wirtz. 
+Added support for Site Maps to urllib's RobotFileParser as :meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. +Patch by Lady Red, based on patch by Peter Wirtz. From 1791c4e2d8628a847b829dcdc5fe4bef2864cc16 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 22:20:16 -0700 Subject: [PATCH 09/12] I think this will become 80 characters --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 074aa16dd9ead26..48b2bdab1dd4f6a 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1,2 +1,2 @@ -Added support for Site Maps to urllib's RobotFileParser as :meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. -Patch by Lady Red, based on patch by Peter Wirtz. +Added support for Site Maps to urllib's RobotFileParser +as :meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. Patch by Lady Red, based on patch by Peter Wirtz. From 97d849128aa416197f907ec9ed1689c1458ed580 Mon Sep 17 00:00:00 2001 From: mcscope Date: Tue, 15 May 2018 22:25:04 -0700 Subject: [PATCH 10/12] This is more obviously 80 characters --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 48b2bdab1dd4f6a..5526fa652b8dbbf 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1,2 +1,3 @@ -Added support for Site Maps to urllib's RobotFileParser -as :meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. Patch by Lady Red, based on patch by Peter Wirtz. 
+Added support for Site Maps to urllib's RobotFileParser as +:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. +Patch by Lady Red, based on patch by Peter Wirtz. From 3e2aa658c9807472b014ccd60524e29e53d826f9 Mon Sep 17 00:00:00 2001 From: Berker Peksag Date: Wed, 16 May 2018 09:38:47 +0300 Subject: [PATCH 11/12] Update 2018-05-15-15-03-48.bpo-28612.E9dz39.rst --- .../next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst index 5526fa652b8dbbf..e3e8f16eef07f77 100644 --- a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst +++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst @@ -1,3 +1,3 @@ -Added support for Site Maps to urllib's RobotFileParser as +Added support for Site Maps to urllib's ``RobotFileParser`` as :meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`. Patch by Lady Red, based on patch by Peter Wirtz. From 3e38354adadcb04ffd4bc252b88c67a0f9bc3a1c Mon Sep 17 00:00:00 2001 From: Berker Peksag Date: Wed, 16 May 2018 09:39:15 +0300 Subject: [PATCH 12/12] Update test_robotparser.py --- Lib/test/test_robotparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 3c23676467608ae..84a267ad9567ee7 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -374,5 +374,5 @@ def test_read_404(self): self.assertIsNone(parser.crawl_delay('*')) self.assertIsNone(parser.request_rate('*')) -if __name__ == '__main__': +if __name__=='__main__': unittest.main()