Skip to content
Permalink
Browse files

bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)

Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
(cherry picked from commit 8047e0e)

Co-authored-by: Rémi Lapeyre <remi.lapeyre@henki.fr>
  • Loading branch information
miss-islington and remilapeyre committed Jun 16, 2019
1 parent 159ae24 commit 45d6547acfb9ae1639adbe03dd14f38cd0642ca2
@@ -76,30 +76,38 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):


class BaseRequestRateTest(BaseRobotTest):
    """Mixin checking crawl_delay/request_rate against expected values.

    Subclasses set ``request_rate`` and ``crawl_delay`` to the values the
    parsed robots.txt should yield; the default of None means "no relevant
    rule", for which the parser must return None rather than raise
    AttributeError (bpo-35922).
    """
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                # Always compare, even when the expectation is None, so the
                # "no relevant rule" path is exercised too.
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    # An empty robots.txt defines no rules: every URL is allowed and both
    # crawl_delay and request_rate stay at the inherited None, exercising
    # the "no relevant rule" code path fixed by bpo-35922.
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = """\
User-agent: figtree
@@ -120,10 +128,6 @@ class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):

class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    # Same robots.txt as the parent test, queried with an agent string that
    # matches the 'figtree' User-agent line.  The parent's expected
    # crawl_delay and request_rate apply unchanged; overriding them back to
    # None here (as the pre-fix version did) would silently skip the
    # assertions for this agent.
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
def crawl_delay(self, useragent):
    """Return the Crawl-delay that applies to *useragent*, or None.

    Returns None when robots.txt has not been (successfully) read, or
    when neither a matching entry nor a default ('*') entry defines a
    delay.
    """
    if not self.mtime():
        # robots.txt was never fetched/parsed: no information available.
        return None
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.delay
    # bpo-35922: default_entry may be unset/falsy when robots.txt had no
    # '*' section; guard so we return None instead of raising
    # AttributeError on it.
    if self.default_entry:
        return self.default_entry.delay
    return None

def request_rate(self, useragent):
    """Return the Request-rate that applies to *useragent*, or None.

    Returns None when robots.txt has not been (successfully) read, or
    when neither a matching entry nor a default ('*') entry defines a
    request rate.
    """
    if not self.mtime():
        # robots.txt was never fetched/parsed: no information available.
        return None
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.req_rate
    # bpo-35922: guard the default entry, mirroring crawl_delay(), so a
    # missing '*' section yields None instead of AttributeError.
    if self.default_entry:
        return self.default_entry.req_rate
    return None

def __str__(self):
entries = self.entries
@@ -0,0 +1,4 @@
Fix :meth:`RobotFileParser.crawl_delay` and
:meth:`RobotFileParser.request_rate` to return ``None`` rather than
raise :exc:`AttributeError` when no relevant rule is defined in the
robots.txt file. Patch by Rémi Lapeyre.

0 comments on commit 45d6547

Please sign in to comment.
You can’t perform that action at this time.