From 9fa9fb6117e0e603dd2b01a8223ca8d6d500b21b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 18 Aug 2006 15:47:10 -0400 Subject: [PATCH] Change planet:subscription to planet:source, and expand the information provided in each. --- examples/planet.xslt | 6 ++- planet/__init__.py | 2 + planet/config.py | 8 ++- planet/feedparser.py | 6 ++- planet/reconstitute.py | 18 +++---- planet/spider.py | 54 ++++++++++++------- planet/splice.py | 25 +++++---- spider.py | 1 + splice.py | 1 + tests/data/spider/config.ini | 3 ++ tests/data/splice/cache/example.com,3 | 15 ++++++ tests/data/splice/cache/example.com,4 | 15 ++++++ ... planet.intertwingly.net,2006,testfeed1,1} | 0 ... planet.intertwingly.net,2006,testfeed1,2} | 0 ... planet.intertwingly.net,2006,testfeed1,3} | 0 ... planet.intertwingly.net,2006,testfeed1,4} | 0 ... planet.intertwingly.net,2006,testfeed2,1} | 0 ... planet.intertwingly.net,2006,testfeed2,2} | 0 ... planet.intertwingly.net,2006,testfeed2,3} | 0 ... planet.intertwingly.net,2006,testfeed2,4} | 0 .../planet.intertwingly.net,2006,testfeed3,1 | 15 ++++++ .../planet.intertwingly.net,2006,testfeed3,2 | 15 ++++++ .../sources/tests,data,spider,testfeed1b.atom | 15 ++++++ .../sources/tests,data,spider,testfeed2.atom | 15 ++++++ .../sources/tests,data,spider,testfeed3.rss | 8 +++ tests/data/splice/config.ini | 5 ++ tests/test_spider.py | 13 +++-- tests/test_splice.py | 6 +-- 28 files changed, 195 insertions(+), 51 deletions(-) mode change 100644 => 100755 spider.py mode change 100644 => 100755 splice.py create mode 100644 tests/data/splice/cache/example.com,3 create mode 100644 tests/data/splice/cache/example.com,4 rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed1,1 => planet.intertwingly.net,2006,testfeed1,1} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed1,2 => planet.intertwingly.net,2006,testfeed1,2} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed1,3 => planet.intertwingly.net,2006,testfeed1,3} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed1,4 => planet.intertwingly.net,2006,testfeed1,4} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed2,1 => planet.intertwingly.net,2006,testfeed2,1} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed2,2 => planet.intertwingly.net,2006,testfeed2,2} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed2,3 => planet.intertwingly.net,2006,testfeed2,3} (100%) rename tests/data/splice/cache/{tag:planet.intertwingly.net,2006:testfeed2,4 => planet.intertwingly.net,2006,testfeed2,4} (100%) create mode 100644 tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1 create mode 100644 tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2 create mode 100644 tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom create mode 100644 tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom create mode 100644 tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss diff --git a/examples/planet.xslt b/examples/planet.xslt index 420b0ea..c9bc464 100644 --- a/examples/planet.xslt +++ b/examples/planet.xslt @@ -19,13 +19,15 @@

Subscriptions

diff --git a/planet/__init__.py b/planet/__init__.py index 7cade8c..d66a958 100644 --- a/planet/__init__.py +++ b/planet/__init__.py @@ -1,3 +1,5 @@ +xmlns = 'http://planet.intertwingly.net/' + logger = None def getLogger(level): diff --git a/planet/config.py b/planet/config.py index 855d038..2e657af 100644 --- a/planet/config.py +++ b/planet/config.py @@ -26,7 +26,7 @@ * error handling (example: no planet section) """ -import sys +import os, sys from ConfigParser import ConfigParser parser = ConfigParser() @@ -83,6 +83,12 @@ def template_files(): """ list the templates defined """ return parser.get('Planet','template_files').split(' ') +def cache_sources_directory(): + if parser.has_option('Planet', 'cache_sources_directory'): + parser.get('Planet', 'cache_sources_directory') + else: + return os.path.join(cache_directory(), 'sources') + def feeds(): """ list the feeds defined """ return filter(lambda feed: feed!='Planet' and feed not in template_files(), diff --git a/planet/feedparser.py b/planet/feedparser.py index 3ac0fda..a99c85a 100755 --- a/planet/feedparser.py +++ b/planet/feedparser.py @@ -11,7 +11,7 @@ Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 1.131 $"[11:16] + "-cvs" +__version__ = "4.2-pre-" + "$Revision: 1.132 $"[11:16] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -2379,12 +2379,16 @@ def handle_data(self, text): _BaseHTMLProcessor.handle_data(self, text) def sanitize_style(self, style): + # disallow urls + style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) + # gauntlet if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' clean = [] for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): + if not value: continue if prop.lower() in self.acceptable_css_properties: clean.append(prop + ': ' + value + ';') elif prop.split('-')[0].lower() in ['background','border','margin','padding']: diff --git a/planet/reconstitute.py b/planet/reconstitute.py index b8b090d..2f57726 100644 --- a/planet/reconstitute.py +++ b/planet/reconstitute.py @@ -18,6 +18,7 @@ from xml.dom import minidom from BeautifulSoup import BeautifulSoup from xml.parsers.expat import ExpatError +import planet illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") @@ -141,10 +142,9 @@ def content(xentry, name, detail, bozo): xentry.appendChild(xcontent) -def source(xentry, source, bozo): +def source(xsource, source, bozo): """ copy source information to the entry """ - xdoc = xentry.ownerDocument - xsource = xdoc.createElement('source') + xdoc = xsource.ownerDocument createTextElement(xsource, 'id', source.get('id', None)) createTextElement(xsource, 'icon', source.get('icon', None)) @@ -164,16 +164,14 @@ def source(xentry, source, bozo): # propagate planet inserted information for key, value in source.items(): - if key.startswith('planet:'): - createTextElement(xsource, key, value) - - xentry.appendChild(xsource) + if key.startswith('planet_'): + createTextElement(xsource, key.replace('_',':',1), value) def reconstitute(feed, entry): """ create an entry document from a parsed feed """ xdoc=minidom.parseString('\n') xentry=xdoc.documentElement - xentry.setAttribute('xmlns:planet','http://planet.intertwingly.net/') + xentry.setAttribute('xmlns:planet',planet.xmlns) id(xentry, entry) links(xentry, entry) @@ -191,6 +189,8 @@ def reconstitute(feed, entry): for contributor in entry.get('contributors',[]): author(xentry, 'contributor', contributor) - source(xentry, entry.get('source', feed.feed), bozo) + xsource = xdoc.createElement('source') + source(xsource, entry.get('source', feed.feed), bozo) + xentry.appendChild(xsource) return xdoc diff --git a/planet/spider.py b/planet/spider.py index 454ef5c..f411463 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -5,8 +5,9 @@ # Standard library modules import time, calendar, re, os +from xml.dom import minidom # Planet modules -import config, feedparser, reconstitute +import planet, config, feedparser, reconstitute try: from xml.dom.ext import PrettyPrint @@ -40,15 +41,45 @@ def filename(directory, filename): return os.path.join(directory, filename) +def write(xdoc, out): + """ write the document out to disk """ + file = open(out,'w') + try: + PrettyPrint(xdoc, file) + except: + # known reasons for failure include no pretty printer installed, + # and absurdly high levels of markup nesting causing Python to + # declare infinite recursion. + file.seek(0) + file.write(xdoc.toxml('utf-8')) + file.close() + xdoc.unlink() + def spiderFeed(feed): """ Spider (fetch) a single feed """ data = feedparser.parse(feed) - cache = config.cache_directory() + if not data.feed: return - # capture data from the planet configuration file + # capture feed and data from the planet configuration file + if not data.feed.has_key('links'): data.feed['links'] = list() + for link in data.feed.links: + if link.rel == 'self': break + else: + data.feed.links.append(feedparser.FeedParserDict( + {'rel':'self', 'type':'application/atom+xml', 'href':feed})) for name, value in config.feed_options(feed).items(): - data.feed['planet:'+name] = value + data.feed['planet_'+name] = value + # write the feed info to the cache + sources = config.cache_sources_directory() + if not os.path.exists(sources): os.makedirs(sources) + xdoc=minidom.parseString('''\n''' % planet.xmlns) + reconstitute.source(xdoc.documentElement, data.feed, data.bozo) + write(xdoc, filename(sources, feed)) + + # write each entry to the cache + cache = config.cache_directory() for entry in data.entries: if not entry.has_key('id'): entry['id'] = reconstitute.id(None, entry) @@ -65,24 +96,11 @@ def spiderFeed(feed): mtime = time.time() entry['updated_parsed'] = time.gmtime(mtime) - xml = reconstitute.reconstitute(data, entry) - - file = open(out,'w') - try: - PrettyPrint(reconstitute.reconstitute(data, entry), file) - except: - # known reasons for failure include no pretty printer installed, - # and absurdly high levels of markup nesting causing Python to - # declare infinite recursion. - file.seek(0) - file.write(reconstitute.reconstitute(data, entry).toxml('utf-8')) - file.close() - + write(reconstitute.reconstitute(data, entry), out) os.utime(out, (mtime, mtime)) def spiderPlanet(configFile): """ Spider (fetch) an entire planet """ - import planet config.load(configFile) log = planet.getLogger(config.log_level()) planet.setTimeout(config.feed_timeout()) diff --git a/planet/splice.py b/planet/splice.py index cbc5740..55f9739 100644 --- a/planet/splice.py +++ b/planet/splice.py @@ -1,8 +1,9 @@ """ Splice together a planet from a cache of feed entries """ import glob, os from xml.dom import minidom -import config +import planet, config, feedparser, reconstitute from reconstitute import createTextElement +from spider import filename def splice(configFile): """ Splice together a planet from a cache of entries """ @@ -11,7 +12,8 @@ def splice(configFile): log = planet.getLogger(config.log_level()) cache = config.cache_directory() - dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")] + dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*") + if not os.path.isdir(file)] dir.sort() dir.reverse() @@ -34,17 +36,14 @@ def splice(configFile): feed.appendChild(entry.documentElement) # insert subscription information - feed.setAttribute('xmlns:planet','http://planet.intertwingly.net/') + feed.setAttribute('xmlns:planet',planet.xmlns) + sources = config.cache_sources_directory() for sub in config.feeds(): - name = config.feed_options(sub).get('name','') - xsub = doc.createElement('planet:subscription') - xlink = doc.createElement('link') - xlink.setAttribute('rel','self') - xlink.setAttribute('href',sub.decode('utf-8')) - xsub.appendChild(xlink) - xname = doc.createElement('planet:name') - xname.appendChild(doc.createTextNode(name.decode('utf-8'))) - xsub.appendChild(xname) - feed.appendChild(xsub) + data=feedparser.parse(filename(sources,sub)) + if not data.feed: continue + xdoc=minidom.parseString('''\n''' % planet.xmlns) + reconstitute.source(xdoc.documentElement, data.feed, data.bozo) + feed.appendChild(xdoc.documentElement) return doc diff --git a/spider.py b/spider.py old mode 100644 new mode 100755 index 39b73ee..e4aab2e --- a/spider.py +++ b/spider.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Main program to run just the spider portion of planet """ diff --git a/splice.py b/splice.py old mode 100644 new mode 100755 index e98a559..e5ed424 --- a/splice.py +++ b/splice.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Main program to run just the splice portion of planet """ diff --git a/tests/data/spider/config.ini b/tests/data/spider/config.ini index b4b2075..7b38417 100644 --- a/tests/data/spider/config.ini +++ b/tests/data/spider/config.ini @@ -2,6 +2,9 @@ cache_directory = tests/work/spider/cache template_files = +[tests/data/spider/testfeed0.atom] +name = not found + [tests/data/spider/testfeed1b.atom] name = one diff --git a/tests/data/splice/cache/example.com,3 b/tests/data/splice/cache/example.com,3 new file mode 100644 index 0000000..df0943b --- /dev/null +++ b/tests/data/splice/cache/example.com,3 @@ -0,0 +1,15 @@ + + + http://example.com/3 + + Earth + the Blue Planet + 2006-01-03T00:00:00Z + + + + It’s just data + Sam Ruby + three + + diff --git a/tests/data/splice/cache/example.com,4 b/tests/data/splice/cache/example.com,4 new file mode 100644 index 0000000..bc229ff --- /dev/null +++ b/tests/data/splice/cache/example.com,4 @@ -0,0 +1,15 @@ + + + http://example.com/4 + + Mars + the Red Planet + 2006-08-18T18:30:50Z + + + + It’s just data + Sam Ruby + three + + diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,1 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,1 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,2 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,2 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,3 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,3 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,4 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,4 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,1 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,1 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,2 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,2 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,3 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,3 diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,4 similarity index 100% rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4 rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,4 diff --git a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1 new file mode 100644 index 0000000..5ca9f26 --- /dev/null +++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1 @@ -0,0 +1,15 @@ + + + tag:planet.intertwingly.net,2006:testfeed3/1 + + Mercury + Messenger of the Roman Gods + 2006-01-01T00:00:00Z + + + + It’s just data + Sam Ruby + three + + diff --git a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2 new file mode 100644 index 0000000..f5acd6b --- /dev/null +++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2 @@ -0,0 +1,15 @@ + + + tag:planet.intertwingly.net,2006:testfeed3/2 + + Venus + the Morning Star + 2006-08-18T18:30:50Z + + + + It’s just data + Sam Ruby + three + + diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom b/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom new file mode 100644 index 0000000..8cb9e5c --- /dev/null +++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom @@ -0,0 +1,15 @@ + + + tag:planet.intertwingly.net,2006:testfeed1 + + Sam Ruby + rubys@intertwingly.net + http://www.intertwingly.net/blog/ + + + + It’s just data + Sam Ruby + 2006-06-17T00:15:18Z + one + diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom b/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom new file mode 100644 index 0000000..6aeb0ab --- /dev/null +++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom @@ -0,0 +1,15 @@ + + + tag:planet.intertwingly.net,2006:testfeed2 + + Sam Ruby + rubys@intertwingly.net + http://www.intertwingly.net/blog/ + + + + It’s just data + Sam Ruby + 2006-06-17T00:15:18Z + two + diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss b/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss new file mode 100644 index 0000000..c464236 --- /dev/null +++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss @@ -0,0 +1,8 @@ + + + + + It’s just data + Sam Ruby + three + diff --git a/tests/data/splice/config.ini b/tests/data/splice/config.ini index f700e73..0ba74c3 100644 --- a/tests/data/splice/config.ini +++ b/tests/data/splice/config.ini @@ -3,9 +3,14 @@ name = test planet cache_directory = tests/data/splice/cache template_files = +[tests/data/spider/testfeed0.atom] +name = not found + [tests/data/spider/testfeed1b.atom] name = one [tests/data/spider/testfeed2.atom] name = two +[tests/data/spider/testfeed3.rss] +name = three diff --git a/tests/test_spider.py b/tests/test_spider.py index 3bd9284..1f945e8 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -17,6 +17,10 @@ def setUp(self): os.makedirs(workdir) def tearDown(self): + for file in glob.glob(workdir+"/sources/*"): + os.unlink(file) + if os.path.exists(workdir+"/sources"): + os.rmdir(workdir+"/sources") for file in glob.glob(workdir+"/*"): os.unlink(file) os.removedirs(workdir) @@ -36,8 +40,8 @@ def test_spiderFeed(self): spiderFeed(testfeed % '1b') files = glob.glob(workdir+"/*") - # verify that exactly four files were produced - self.assertEqual(4, len(files)) + # verify that exactly four files + one sources dir were produced + self.assertEqual(5, len(files)) # verify that the file names are as expected self.assertTrue(workdir + @@ -45,6 +49,7 @@ def test_spiderFeed(self): # verify that the file timestamps match atom:updated for file in files: + if file.endswith('/sources'): continue data = feedparser.parse(file) self.assertTrue(data.entries[0].source.planet_name) self.assertEqual(os.stat(file).st_mtime, @@ -58,8 +63,8 @@ def test_spiderPlanet(self): spiderPlanet(configfile) files = glob.glob(workdir+"/*") - # verify that exactly eight files were produced - self.assertEqual(12, len(files)) + # verify that exactly eight files + 1 source dir were produced + self.assertEqual(13, len(files)) # verify that the file names are as expected self.assertTrue(workdir + diff --git a/tests/test_splice.py b/tests/test_splice.py index 2b1a5ee..99fb446 100644 --- a/tests/test_splice.py +++ b/tests/test_splice.py @@ -9,9 +9,9 @@ class SpliceTest(unittest.TestCase): def test_splice(self): doc = splice(configfile) - self.assertEqual(8,len(doc.getElementsByTagName('entry'))) - self.assertEqual(2,len(doc.getElementsByTagName('planet:subscription'))) - self.assertEqual(10,len(doc.getElementsByTagName('planet:name'))) + self.assertEqual(12,len(doc.getElementsByTagName('entry'))) + self.assertEqual(3,len(doc.getElementsByTagName('planet:source'))) + self.assertEqual(15,len(doc.getElementsByTagName('planet:name'))) self.assertEqual('test planet', doc.getElementsByTagName('title')[0].firstChild.nodeValue)