Merged
161 changes: 141 additions & 20 deletions Lib/test/test_robotparser.py
@@ -16,6 +16,14 @@ class BaseRobotTest:
bad = []
site_maps = None

def __init_subclass__(cls):
super().__init_subclass__()
# Remove tests that do nothing.
if not cls.good:
cls.test_good_urls = None
if not cls.bad:
cls.test_bad_urls = None

def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
Disallow: /another/path?
Disallow: /yet/one/path?name=value&more
"""
good = ['/some/path']
bad = ['/some/path?name=value']
good = ['/some/path', '/some/path?',
'/some/path%3Fname=value', '/some/path?name%3Dvalue',
'/another/path', '/another/path%3F',
'/yet/one/path?name=value%26more']
    bad = ['/some/path?name=value',
'/another/path?', '/another/path?name=value',
'/yet/one/path?name=value&more']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
# normalize the URL first (#17403)
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""
good = ['/some/path?']
bad = ['/another/path?']
Disallow: /a1/Z-._~ # unreserved characters
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
Disallow: /u2/%f0%9f%90%8d
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
Disallow: /v2/%f0
Disallow: /v3/\udcf0 # raw non-ASCII octet
Disallow: /p1%xy # raw percent
Disallow: /p2%
Disallow: /p3%25xy # percent-encoded percent
Disallow: /p4%2525xy # double percent-encoded percent
Disallow: /john%20smith # space
Disallow: /john doe
Disallow: /trailingspace%20
Disallow: /question%3Fq=v # not query
Disallow: /hash%23f # not fragment
Disallow: /dollar%24
Disallow: /asterisk%2A
Disallow: /sub/dir
Disallow: /slash%2F
Disallow: /query/question?q=%3F
Disallow: /query/raw/question?q=?
Disallow: /query/eq?q%3Dv
Disallow: /query/amp?q=v%26a
"""
good = [
'/u1/%F0', '/u1/%f0',
'/u2/%F0', '/u2/%f0',
'/u3/%F0', '/u3/%f0',
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
'/question?q=v',
'/dollar', '/asterisk',
'/query/eq?q=v',
'/query/amp?q=v&a',
]
bad = [
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
'/p1%xy', '/p1%25xy',
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
'/p3%xy', '/p3%25xy',
'/p4%2525xy',
'/john%20smith', '/john smith',
'/john%20doe', '/john doe',
'/trailingspace%20', '/trailingspace ',
'/question%3Fq=v',
'/hash#f', '/hash%23f',
'/dollar$', '/dollar%24',
'/asterisk*', '/asterisk%2A',
'/sub/dir', '/sub%2Fdir',
'/slash%2F', '/slash/',
'/query/question?q=?', '/query/question?q=%3F',
'/query/raw/question?q=?', '/query/raw/question?q=%3F',
'/query/eq?q%3Dv',
'/query/amp?q=v%26a',
]
# other reserved characters
for c in ":/[]@!$&'()*+,;=":
robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
bad.append(f'/raw{c}')
bad.append(f'/raw%{ord(c):02X}')
bad.append(f'/pc{c}')
bad.append(f'/pc%{ord(c):02X}')


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

def do_GET(self):
self.send_error(403, "Forbidden access")

def log_message(self, format, *args):
pass


@unittest.skipUnless(
support.has_socket_support,
"Socket server requires working socket."
)
class PasswordProtectedSiteTestCase(unittest.TestCase):
class BaseLocalNetworkTestCase:

def setUp(self):
# clear _opener global variable
self.addCleanup(urllib.request.urlcleanup)

self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)

self.t = threading.Thread(
name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
self.t.join()
self.server.server_close()


SAMPLE_ROBOTS_TXT = b'''\
User-agent: test_robotparser
Disallow: /utf8/\xf0\x9f\x90\x8d
Disallow: /non-utf8/\xf0
Disallow: //[spam]/path
'''


class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):

def do_GET(self):
self.send_response(200)
self.end_headers()
self.wfile.write(SAMPLE_ROBOTS_TXT)

def log_message(self, format, *args):
pass

@threading_helper.reap_threads
def testRead(self):
# Test that reading a weird robots.txt doesn't fail.
addr = self.server.server_address
url = f'http://{socket_helper.HOST}:{addr[1]}'
robots_url = url + '/robots.txt'
parser = urllib.robotparser.RobotFileParser()
parser.set_url(robots_url)
parser.read()
# And it can even interpret the weird paths in some reasonable way.
agent = 'test_robotparser'
self.assertTrue(parser.can_fetch(agent, robots_url))
self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))


class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
class RobotHandler(BaseHTTPRequestHandler):

def do_GET(self):
self.send_error(403, "Forbidden access")

def log_message(self, format, *args):
pass

@threading_helper.reap_threads
def testPasswordProtectedSite(self):
addr = self.server.server_address
33 changes: 22 additions & 11 deletions Lib/urllib/robotparser.py
@@ -11,6 +11,7 @@
"""

import collections
import re
import urllib.error
import urllib.parse
import urllib.request
@@ -20,6 +21,19 @@
RequestRate = collections.namedtuple("RequestRate", "requests seconds")


def normalize(path):
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
return urllib.parse.quote(unquoted, errors='surrogateescape')

def normalize_path(path):
path, sep, query = path.partition('?')
path = normalize(path)
if sep:
query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
path += '?' + query
return path


class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -55,7 +69,7 @@ def modified(self):
def set_url(self, url):
"""Sets the URL referring to a robots.txt file."""
self.url = url
self.host, self.path = urllib.parse.urlparse(url)[1:3]
self.host, self.path = urllib.parse.urlsplit(url)[1:3]

def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
err.close()
else:
raw = f.read()
self.parse(raw.decode("utf-8").splitlines())
self.parse(raw.decode("utf-8", "surrogateescape").splitlines())

def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
line[1] = urllib.parse.unquote(line[1].strip())
line[1] = line[1].strip()
if line[0] == "user-agent":
if state == 2:
self._add_entry(entry)
@@ -167,10 +181,9 @@ def can_fetch(self, useragent, url):
return False
# search for given user agent matches
# the first match counts
parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
url = urllib.parse.urlunparse(('','',parsed_url.path,
parsed_url.params,parsed_url.query, parsed_url.fragment))
url = urllib.parse.quote(url)
parsed_url = urllib.parse.urlsplit(url)
url = urllib.parse.urlunsplit(('', '', *parsed_url[2:]))
url = normalize_path(url)
if not url:
url = "/"
for entry in self.entries:
@@ -213,16 +226,14 @@ def __str__(self):
entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries))


class RuleLine:
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
(allowance==False) followed by a path."""
def __init__(self, path, allowance):
if path == '' and not allowance:
# an empty value means allow all
allowance = True
path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
self.path = urllib.parse.quote(path)
self.path = normalize_path(path)
self.allowance = allowance

def applies_to(self, filename):
@@ -268,7 +279,7 @@ def applies_to(self, useragent):
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- filename is URL decoded"""
- filename is URL encoded"""
for line in self.rulelines:
if line.applies_to(filename):
return line.allowance
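For reference, a short sketch of what the new normalize/normalize_path helpers above do (they are private module-level helpers, so importing them directly is only for illustration; results assume the patched module):

from urllib.robotparser import normalize_path

# Path part: percent-encoding is canonicalised (hex case, unreserved
# characters), so equivalent spellings normalize to the same string.
normalize_path('/a%2d.%7e')   # -> '/a-.~'
normalize_path('/a-.~')       # -> '/a-.~'

# Query part: "=" and "&" are kept as structure and only the pieces between
# them are normalised, so a raw "=" stays distinct from a percent-encoded "%3D".
normalize_path('/p?q=v')      # -> '/p?q=v'
normalize_path('/p?q%3Dv')    # -> '/p?q%3Dv'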
@@ -0,0 +1,3 @@
Fix parsing errors in the :mod:`urllib.robotparser` module.
No longer fail when parsing unusual rule paths.
No longer fail when decoding non-UTF-8 ``robots.txt`` files.
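A minimal sketch of the behaviour this entry describes, using rules similar to SAMPLE_ROBOTS_TXT in the tests above (results assume the patched module):

import urllib.robotparser

robots = """\
User-agent: test_robotparser
Disallow: /utf8/\U0001f40d
Disallow: //[spam]/path
"""
parser = urllib.robotparser.RobotFileParser()
parser.parse(robots.splitlines())  # previously this could raise on the bracketed path

agent = 'test_robotparser'
print(parser.can_fetch(agent, '/utf8/'))              # True
print(parser.can_fetch(agent, '/utf8/%F0%9F%90%8D'))  # False
print(parser.can_fetch(agent, '/%2F[spam]/path'))     # False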
@@ -0,0 +1,4 @@
Fix normalization of ``robots.txt`` rules and URLs in the
:mod:`urllib.robotparser` module. A trailing ``?`` is no longer ignored, and
the raw special characters ``?``, ``=`` and ``&`` are now distinguished from
their percent-encoded forms.
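An illustrative sketch of the distinction, with rules and URLs borrowed from the tests in this PR (results assume the patched module):

import urllib.robotparser

robots = """\
User-agent: test_robotparser
Disallow: /some/path?name=value
Disallow: /query/eq?q%3Dv
"""
parser = urllib.robotparser.RobotFileParser()
parser.parse(robots.splitlines())

agent = 'test_robotparser'
print(parser.can_fetch(agent, '/some/path?name=value'))    # False - raw "=" matches the rule
print(parser.can_fetch(agent, '/some/path?name%3Dvalue'))  # True  - "%3D" is a different URL
print(parser.can_fetch(agent, '/query/eq?q=v'))            # True  - the rule uses "%3D"
print(parser.can_fetch(agent, '/query/eq?q%3Dv'))          # False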