Added more unit testing, fixed a few bugs, documented the matching.

1 parent 00ad3c5 commit c20e79341c10bb0692ae3883d063efc33314c048 Dan Lecocq committed Oct 29, 2011
Showing with 83 additions and 10 deletions.
  1. +13 −0 README.md
  2. +12 −10 reppy/__init__.py
  3. +58 −0 tests/testReppy.py
README.md
@@ -13,6 +13,19 @@ Features
- Batch queries
- Configurable user agent for fetching robots.txt
- Automatic refetching based on expiration
+- Support for Crawl-delay
+- Support for Sitemaps
+- Wildcard matching
+
+Matching
+========
+
+This package supports the [1996 RFC](http://www.robotstxt.org/norobots-rfc.txt), as well
+as additional commonly-implemented features like wildcard matching, crawl-delay, and
+sitemaps. Implementations differ in how they match `Allow` and `Disallow`: some use the
+longest match, others the most specific. This package follows the longest directive, on
+the assumption that it is also the most specific -- a term that is a little difficult to
+define in this context.
Usage
=====
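The longest-match rule added to the README above can be sketched in a few lines. This is an illustration only, not reppy's internal code: it uses plain prefix matching instead of the compiled regular expressions, and the rule list is made up.

```python
# Illustrative only: the longest matching directive wins.
rules = [
    (True,  '/subfolder/page1.html'),   # Allow
    (False, '/subfolder/'),             # Disallow
]

def allowed(path):
    # Keep the directives whose pattern matches the requested path.
    matches = [(allow, pat) for allow, pat in rules if path.startswith(pat)]
    if not matches:
        return True   # no directive applies, so the path is allowed by default
    # The longest pattern is assumed to be the most specific one.
    return max(matches, key=lambda m: len(m[1]))[0]

print(allowed('/subfolder/page1.html'))  # True: the Allow is the longer match
print(allowed('/subfolder/other.html'))  # False: only the Disallow matches
```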
reppy/__init__.py
@@ -72,11 +72,11 @@ def findOrMakeRobot(url, agent, agentString):
'''Either return the appropriate global reppy object, or make one'''
global robots
parsed = urlparse.urlparse(url)
- robot = robots.get(parsed.hostname, None)
+ robot = robots.get(parsed.netloc, None)
if not robot:
- robot = reppy.fetch('%s://%s/robots.txt' % (parsed.scheme, parsed.hostname),
+ robot = reppy.fetch('%s://%s/robots.txt' % (parsed.scheme, parsed.netloc),
userAgent=agent, userAgentString=(agentString or getUserAgentString(agent)))
- robots[parsed.hostname] = robot
+ robots[parsed.netloc] = robot
return robot
def allowed(url, agent, agentString=None):
@@ -99,14 +99,16 @@ def sitemaps(url):
return findOrMakeRobot(url).sitemaps
class agent(object):
+ pathRE = re.compile(r'^([^\/]+\/\/)?([^\/]+)?(/?.+?)$')
'''Represents attributes for a given robot'''
def __init__(self):
self.allowances = []
self.crawlDelay = None
def allowed(self, url):
'''Can I fetch a given URL?'''
- path = urllib.unquote(urlparse.urlparse(url).path.replace('%2f', '%252f'))
+ match = agent.pathRE.match(url)
+ path = urllib.unquote(agent.pathRE.match(url).group(3).replace('%2f', '%252f'))
if path == '/robots.txt':
return True
allowed = [a for a in self.allowances if a[1].match(path)]
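For reference, the pathRE added above pulls the path (and query) out of either a full URL or a bare path; its third group is what gets matched against the allowance regexes. A quick check with made-up URLs:

```python
import re

pathRE = re.compile(r'^([^\/]+\/\/)?([^\/]+)?(/?.+?)$')

# group(1) is roughly the scheme plus '//', group(2) the host, group(3) the path and query.
print(pathRE.match('http://example.com/a?howdy').group(3))  # '/a?howdy'
print(pathRE.match('/a?howdy').group(3))                    # '/a?howdy'
```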
@@ -123,7 +125,7 @@ class reppy(object):
'''A class that represents a set of agents, and can select them appropriately.
Associated with one robots.txt file.'''
- lineRE = re.compile('^\s*(\S+)\s*:\s*(.*)\s*(#.+)?$', re.I)
+ lineRE = re.compile('^\s*(\S+)\s*:\s*([^#]*)\s*(#.+)?$', re.I)
def __init__(self, ttl=3600*3, url=None, autorefresh=True, userAgent='REPParser/0.1 (Python)', userAgentString=None):
self.reset()
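The lineRE change above narrows the value capture from `(.*)` to `([^#]*)`, so an inline comment no longer ends up inside the directive's value. A small demonstration with a made-up input line:

```python
import re

lineRE = re.compile(r'^\s*(\S+)\s*:\s*([^#]*)\s*(#.+)?$', re.I)

m = lineRE.match('Disallow: /private/  # staging only')
print(m.group(1), repr(m.group(2).strip()))  # Disallow '/private/'
```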
@@ -191,8 +193,8 @@ def refresh(self):
def makeREFromString(self, s):
'''Make a regular expression that matches the patterns expressible in robots.txt'''
- tmp = s.replace('%2f', '%252f').replace('*', '.+').replace('$', '.+')
- return re.compile(urllib.unquote(tmp))
+ tmp = re.escape(urllib.unquote(s.replace('%2f', '%252f')))
+ return re.compile(tmp.replace('\*', '.*').replace('\$', '$'))
def parse(self, s):
'''Parse the given string and store the resultant rules'''
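Roughly, the new makeREFromString escapes the whole (unquoted) pattern first and only then re-enables the robots.txt wildcards: `*` becomes `.*` and a trailing `$` stays an anchor, while every other regex metacharacter is treated literally. A Python 3 sketch of the same translation (the helper name and sample pattern are illustrative):

```python
import re
from urllib.parse import unquote  # Python 3 counterpart of urllib.unquote above

def make_rule_re(pattern):
    # Escape everything, then restore the two robots.txt wildcards.
    tmp = re.escape(unquote(pattern.replace('%2f', '%252f')))
    return re.compile(tmp.replace(r'\*', '.*').replace(r'\$', '$'))

rule = make_rule_re('/private/*.html$')
print(bool(rule.match('/private/2011/index.html')))  # True
print(bool(rule.match('/private/2011/index.htm')))   # False
```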
@@ -205,12 +207,12 @@ def parse(self, s):
for line in s.split('\n'):
match = self.lineRE.match(line)
if match:
- key = match.group(1).lower()
- val = match.group(2)
+ key = match.group(1).strip().lower()
+ val = match.group(2).strip()
if key == 'user-agent':
# Store the current working agent
self.atts['agents'][curname] = cur
- curname = val
+ curname = val.lower()
if last != 'user-agent':
# If the last line was a user agent, then all lines
# below also apply to the last user agent. So, we'll
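The comment above (cut off by the hunk) is about grouping: when several User-agent lines appear back to back, the rules that follow apply to all of them, i.e. the working agent is not reset between consecutive User-agent lines. For example, with made-up agent names and reppy.parse as used in the tests below:

```python
import reppy

r = reppy.parse('''
    User-agent: aaa
    User-agent: bbb
    Disallow: /private/''')

# Both agents share the same rule group.
assert not r.allowed('/private/secret.html', 'aaa')
assert not r.allowed('/private/secret.html', 'bbb')
```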
tests/testReppy.py
@@ -11,6 +11,9 @@
import random
import unittest
+import logging
+reppy.logger.setLevel(logging.FATAL)
+
class TestReppyRFC(unittest.TestCase):
def test_basic(self):
# Test beginning matching
@@ -202,6 +205,61 @@ def test_case_insensitivity(self):
self.assertTrue(r.disallowed('/a', 'aGent'))
self.assertTrue(r.disallowed('/a', 'AGeNt'))
self.assertTrue(r.disallowed('/a', 'AGENT'))
+
+ def test_query(self):
+ '''Make sure query strings in rules are matched against the URL'''
+ r = reppy.parse('''
+ User-agent: agent
+ Disallow: /a?howdy''')
+ self.assertTrue( r.allowed('/a', 'agent'))
+ self.assertTrue(not r.allowed('/a?howdy', 'agent'))
+ self.assertTrue(not r.allowed('/a?howdy#fragment', 'agent'))
+ self.assertTrue( r.allowed('/a?heyall', 'agent'))
+
+ def test_allow_all(self):
+ # An empty Disallow value allows everything
+ r = reppy.parse('''
+ User-agent: *
+ Disallow: ''')
+ ua = 'dotbot'
+ self.assertTrue( r.allowed('/', ua))
+ self.assertTrue( r.allowed('/foo', ua))
+ self.assertTrue( r.allowed('/foo.html', ua))
+ self.assertTrue( r.allowed('/foo/bar', ua))
+ self.assertTrue( r.allowed('/foo/bar.html', ua))
+
+ def test_disallow_all(self):
+ # But a bare forward slash disallows everything
+ r = reppy.parse('''
+ User-agent: *
+ Disallow: /''')
+ ua = 'dotbot'
+ self.assertTrue(not r.allowed('/', ua))
+ self.assertTrue(not r.allowed('/foo', ua))
+ self.assertTrue(not r.allowed('/foo.html', ua))
+ self.assertTrue(not r.allowed('/foo/bar', ua))
+ self.assertTrue(not r.allowed('/foo/bar.html', ua))
+
+ def test_allow_certain_pages_only(self):
+ r = reppy.parse('''
+ User-agent: *
+ Allow: /onepage.html
+ Allow: /oneotherpage.php
+ Disallow: /
+ Allow: /subfolder/page1.html
+ Allow: /subfolder/page2.php
+ Disallow: /subfolder/''')
+ ua = 'dotbot'
+ self.assertTrue(not r.allowed('/', ua))
+ self.assertTrue(not r.allowed('/foo', ua))
+ self.assertTrue(not r.allowed('/bar.html', ua))
+ self.assertTrue( r.allowed('/onepage.html', ua))
+ self.assertTrue( r.allowed('/oneotherpage.php', ua))
+ self.assertTrue(not r.allowed('/subfolder', ua))
+ self.assertTrue(not r.allowed('/subfolder/', ua))
+ self.assertTrue(not r.allowed('/subfolder/aaaaa', ua))
+ self.assertTrue( r.allowed('/subfolder/page1.html', ua))
+ self.assertTrue( r.allowed('/subfolder/page2.php', ua))
if __name__ == '__main__':
unittest.main()
