From 80991af3a6a02a6cfd2a0f8f97d19951e3506935 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 4 May 2026 22:52:29 +0300 Subject: [PATCH] gh-149381: Optimize robotparser for long list of rules --- Lib/test/test_robotparser.py | 6 +++--- Lib/urllib/robotparser.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 3ea0ec66fbfbe9..2ce916fedc2708 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -388,8 +388,8 @@ class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase): expected_output = """\ User-agent: spambot User-agent: eggsbot -Disallow: /some/path -Disallow: /another/path\ +Disallow: /another/path +Disallow: /some/path\ """ @@ -445,10 +445,10 @@ class WeirdPathTest(BaseRobotTest, unittest.TestCase): '/e$$', '/ex$y$', '/g'] expected_output = """\ User-agent: * -Disallow: /a$ Disallow: /c* Disallow: /d*z Disallow: /e*$ +Disallow: /a$ Disallow: /g$\ """ diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index e70eae80036784..13e016ff74c2ee 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -81,6 +81,7 @@ def _add_entry(self, entry): self.groups[agent] = entry else: self.groups[agent] = merge_entries(self.groups[agent], entry) + sort_rulelines(self.groups[agent].rulelines) def parse(self, lines): """Parse the input lines from a robots.txt file. @@ -305,6 +306,9 @@ def allowance(self, filename): """Preconditions: - our agent applies to this entry - filename is URL encoded + - rules are sorted: + - wildcards before literal paths + - literal paths from longest to shortest, "Allow" before "Disallow" """ best_match = -1 allowance = True @@ -316,6 +320,9 @@ def allowance(self, filename): allowance = line.allowance elif m == best_match and not allowance: allowance = line.allowance + # Optimization. Requires rules to be sorted. + if line.matcher is None and (m or len(line.path) + 1 < best_match): + break return allowance @@ -353,3 +360,11 @@ def merge_entries(e1, e2): entry.delay = e1.delay if e2.delay is None else e2.delay entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate return entry + +def sort_rulelines(rulelines): + def sortkey(line): + if line.matcher is not None: + return (True,) + else: + return (False, len(line.path), line.allowance) + rulelines.sort(key=sortkey, reverse=True)