Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Added expunge and preliminary test cases

  • Loading branch information...
commit a51d09ec07d49610c11dee3bcf514c0abec4aa94 1 parent 567eb64
@mortenf mortenf authored
View
2  THANKS
@@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
-Morten Fredericksen - Support WordPress LinkManager OPML
+Morten Frederiksen - Support WordPress LinkManager OPML
Harry Fuecks - default item date to feed date
Antonio Cavedoni - Django templates
View
4 docs/config.html
@@ -111,6 +111,10 @@ <h3 id="planet"><code>[planet]</code></h3>
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
+<dt><ins>cache_keep_entries</ins></dt>
+<dd>Used by <code>expunge</code> to determine how many entries should be
+kept for each source when expunging old entries from the cache directory.
+This may be overridden on a per subscription feed basis.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
View
17 expunge.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+"""
+Main program to run just the expunge portion of planet
+"""
+
+import os.path
+import sys
+from planet import expunge, config
+
+if __name__ == '__main__':
+
+ if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
+ config.load(sys.argv[1])
+ expunge.expungeCache()
+ else:
+ print "Usage:"
+ print " python %s config.ini" % sys.argv[0]
View
8 planet.py
@@ -21,6 +21,7 @@
offline = 0
verbose = 0
only_if_new = 0
+ expunge = 0
for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help":
@@ -31,6 +32,7 @@
print " -o, --offline Update the Planet from the cache only"
print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds"
+ print " -x, --expunge Expunge old entries from cache"
print
sys.exit(0)
elif arg == "-v" or arg == "--verbose":
@@ -39,6 +41,8 @@
offline = 1
elif arg == "-n" or arg == "--only-if-new":
only_if_new = 1
+ elif arg == "-x" or arg == "--expunge":
+ expunge = 1
elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg
sys.exit(1)
@@ -62,3 +66,7 @@
from planet import splice
doc = splice.splice()
splice.apply(doc.toxml('utf-8'))
+
+ if expunge:
+ from planet import expunge
+ expunge.expungeCache()
View
1  planet/config.py
@@ -107,6 +107,7 @@ def define_tmpl_int(name, default):
define_planet('spider_threads', 0)
define_planet_int('feed_timeout', 20)
+ define_planet_int('cache_keep_entries', 10)
define_planet_list('template_files')
define_planet_list('bill_of_materials')
View
68 planet/expunge.py
@@ -0,0 +1,68 @@
+""" Expunge old entries from a cache of entries """
+import glob, os, planet, config, feedparser
+from xml.dom import minidom
+from spider import filename
+
+def expungeCache():
+ """ Expunge old entries from a cache of entries """
+ import planet
+ log = planet.getLogger(config.log_level(),config.log_format())
+
+ log.info("Determining feed subscriptions")
+ entry_count = {}
+ sources = config.cache_sources_directory()
+ for sub in config.subscriptions():
+ data=feedparser.parse(filename(sources,sub))
+ if not data.feed.has_key('id'): continue
+ if config.feed_options(sub).has_key('cache_keep_entries'):
+ entry_count[data.feed.id] = config.feed_options(sub)['cache_keep_entries']
+ else:
+ entry_count[data.feed.id] = config.cache_keep_entries()
+
+ log.info("Listing cached entries")
+ cache = config.cache_directory()
+ dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
+ if not os.path.isdir(file)]
+ dir.sort()
+ dir.reverse()
+
+ for mtime,file in dir:
+
+ try:
+ entry=minidom.parse(file)
+ # determine source of entry
+ entry.normalize()
+ sources = entry.getElementsByTagName('source')
+ if not sources:
+ # no source determined, do not delete
+ log.debug("No source found for %s", file)
+ continue
+ ids = sources[0].getElementsByTagName('id')
+ if not ids:
+ # feed id not found, do not delete
+ log.debug("No source feed id found for %s", file)
+ continue
+ if ids[0].childNodes[0].nodeValue in entry_count:
+ # subscribed to feed, update entry count
+ entry_count[ids[0].childNodes[0].nodeValue] = entry_count[
+ ids[0].childNodes[0].nodeValue] - 1
+ if entry_count[ids[0].childNodes[0].nodeValue] >= 0:
+ # maximum not reached, do not delete
+ log.debug("Maximum not reached for %s from %s",
+ file, ids[0].childNodes[0].nodeValue)
+ continue
+ else:
+ # maximum reached
+ log.debug("Removing %s, maximum reached for %s",
+ file, ids[0].childNodes[0].nodeValue)
+ else:
+ # not subscribed
+ log.debug("Removing %s, not subscribed to %s",
+ file, ids[0].childNodes[0].nodeValue)
+ # remove old entry
+ #os.unlink(file)
+
+ except:
+ log.error("Error parsing %s", file)
+
+# end of expungeCache()
View
20 tests/data/expunge/config.ini
@@ -0,0 +1,20 @@
+[Planet]
+name = test planet
+cache_directory = tests/work/expunge/cache
+cache_keep_entries = 1
+
+[tests/data/expunge/testfeed1.atom]
+name = no source
+
+[tests/data/expunge/testfeed2.atom]
+name = no source id
+
+[tests/data/expunge/testfeed3.atom]
+name = global setting
+
+[tests/data/expunge/testfeed4.atom]
+name = local setting
+cache_keep_entries = 2
+
+#[tests/data/expunge/testfeed5.atom]
+#name = unsubbed
View
8 tests/data/expunge/test1.entry
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
+ <link href="http://example.com/1/1"/>
+ <title>Test 1/1</title>
+ <content>Entry with missing source</content>
+ <updated>2007-03-01T01:01:00Z</updated>
+</entry>
View
11 tests/data/expunge/test2.entry
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
+ <link href="http://example.com/2/1"/>
+ <title>Test 2/1</title>
+ <content>Entry with missing source id</content>
+ <updated>2007-03-01T02:01:00Z</updated>
+ <source>
+ <title>Test 2/1 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test3a.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
+ <link href="http://example.com/3/1"/>
+ <title>Test 3/1</title>
+ <content>Entry for global setting 1</content>
+ <updated>2007-03-01T03:01:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
+ <title>Test 3 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test3b.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
+ <link href="http://example.com/3/2"/>
+ <title>Test 3/2</title>
+ <content>Entry for global setting 2</content>
+ <updated>2007-03-01T03:02:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
+ <title>Test 3 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test3c.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
+ <link href="http://example.com/3/3"/>
+ <title>Test 3/3</title>
+ <content>Entry for global setting 3</content>
+ <updated>2007-03-01T03:03:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
+ <title>Test 3 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test4a.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
+ <link href="http://example.com/4/1"/>
+ <title>Test 4/1</title>
+ <content>Entry for local setting 1</content>
+ <updated>2007-03-01T04:01:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
+ <title>Test 4 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test4b.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
+ <link href="http://example.com/4/2"/>
+ <title>Test 4/2</title>
+ <content>Entry for local setting 2</content>
+ <updated>2007-03-01T04:02:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
+ <title>Test 4 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test4c.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
+ <link href="http://example.com/4/3"/>
+ <title>Test 4/3</title>
+ <content>Entry for local setting 3</content>
+ <updated>2007-03-01T04:03:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
+ <title>Test 4 source</title>
+ </source>
+</entry>
View
12 tests/data/expunge/test5.entry
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom">
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
+ <link href="http://example.com/5/1"/>
+ <title>Test 5/1</title>
+ <content>Entry from unsubbed feed</content>
+ <updated>2007-03-01T05:01:00Z</updated>
+ <source>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
+ <title>Test 5 source</title>
+ </source>
+</entry>
View
5 tests/data/expunge/testfeed1.atom
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
+</feed>
View
5 tests/data/expunge/testfeed2.atom
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
+</feed>
View
5 tests/data/expunge/testfeed3.atom
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
+</feed>
View
5 tests/data/expunge/testfeed4.atom
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
+ <id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
+</feed>
View
145 tests/test_expunge.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+
+#import unittest, os, glob, calendar, shutil, time
+#from planet.spider import filename, spiderPlanet, writeCache
+#from planet import feedparser, config
+#import planet
+
+workdir = 'tests/work/expunge/cache'
+testfeed = 'tests/data/expunge/testfeed%s.atom'
+configfile = 'tests/data/expunge/config.ini'
+
+class ExpungeTest(unittest.TestCase):
+ def setUp(self):
+ # silence errors
+ planet.logger = None
+ planet.getLogger('CRITICAL',None)
+
+ try:
+ os.makedirs(workdir)
+ except:
+ self.tearDown()
+ os.makedirs(workdir)
+
+ def tearDown(self):
+ shutil.rmtree(workdir)
+ os.removedirs(os.path.split(workdir)[0])
+
+ def test_filename(self):
+ self.assertEqual(os.path.join('.', 'example.com,index.html'),
+ filename('.', 'http://example.com/index.html'))
+ self.assertEqual(os.path.join('.',
+ 'planet.intertwingly.net,2006,testfeed1,1'),
+ filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
+ self.assertEqual(os.path.join('.',
+ '00000000-0000-0000-0000-000000000000'),
+ filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))
+
+ # Requires Python 2.3
+ try:
+ import encodings.idna
+ except:
+ return
+ self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
+ filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
+
+ def spiderFeed(self, feed_uri):
+ feed_info = feedparser.parse('<feed/>')
+ data = feedparser.parse(feed_uri)
+ writeCache(feed_uri, feed_info, data)
+
+ def verify_spiderFeed(self):
+ files = glob.glob(workdir+"/*")
+ files.sort()
+
+ # verify that exactly four files + one sources dir were produced
+ self.assertEqual(5, len(files))
+
+ # verify that the file names are as expected
+ self.assertTrue(os.path.join(workdir,
+ 'planet.intertwingly.net,2006,testfeed1,1') in files)
+
+ # verify that the file timestamps match atom:updated
+ data = feedparser.parse(files[2])
+ self.assertEqual(['application/atom+xml'], [link.type
+ for link in data.entries[0].source.links if link.rel=='self'])
+ self.assertEqual('one', data.entries[0].source.planet_name)
+ self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
+ self.assertEqual(os.stat(files[2]).st_mtime,
+ calendar.timegm(data.entries[0].updated_parsed))
+
+ def test_spiderFeed(self):
+ config.load(configfile)
+ self.spiderFeed(testfeed % '1b')
+ self.verify_spiderFeed()
+
+ def test_spiderUpdate(self):
+ config.load(configfile)
+ self.spiderFeed(testfeed % '1a')
+ self.spiderFeed(testfeed % '1b')
+ self.verify_spiderFeed()
+
+ def verify_spiderPlanet(self):
+ files = glob.glob(workdir+"/*")
+
+ # verify that exactly eight files + 1 source dir were produced
+ self.assertEqual(14, len(files))
+
+ # verify that the file names are as expected
+ self.assertTrue(os.path.join(workdir,
+ 'planet.intertwingly.net,2006,testfeed1,1') in files)
+ self.assertTrue(os.path.join(workdir,
+ 'planet.intertwingly.net,2006,testfeed2,1') in files)
+
+ data = feedparser.parse(workdir +
+ '/planet.intertwingly.net,2006,testfeed3,1')
+ self.assertEqual(['application/rss+xml'], [link.type
+ for link in data.entries[0].source.links if link.rel=='self'])
+ self.assertEqual('three', data.entries[0].source.author_detail.name)
+ self.assertEqual('three', data.entries[0].source['planet_css-id'])
+
+ def test_spiderPlanet(self):
+ config.load(configfile)
+ spiderPlanet()
+ self.verify_spiderPlanet()
+
+ def test_spiderThreads(self):
+ config.load(configfile.replace('config','threaded'))
+ _PORT = config.parser.getint('Planet','test_port')
+
+ log = []
+ from SimpleHTTPServer import SimpleHTTPRequestHandler
+ class TestRequestHandler(SimpleHTTPRequestHandler):
+ def log_message(self, format, *args):
+ log.append(args)
+
+ from threading import Thread
+ class TestServerThread(Thread):
+ def __init__(self):
+ self.ready = 0
+ self.done = 0
+ Thread.__init__(self)
+ def run(self):
+ from BaseHTTPServer import HTTPServer
+ httpd = HTTPServer(('',_PORT), TestRequestHandler)
+ self.ready = 1
+ while not self.done:
+ httpd.handle_request()
+
+ httpd = TestServerThread()
+ httpd.start()
+ while not httpd.ready:
+ time.sleep(0.1)
+
+ try:
+ spiderPlanet()
+ finally:
+ httpd.done = 1
+ import urllib
+ urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()
+
+ status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
+ status.sort()
+ self.assertEqual([200,200,200,200,404], status)
+
+ self.verify_spiderPlanet()
Please sign in to comment.
Something went wrong with that request. Please try again.