Coercedates plugin with fixes to spider.py allowing filters to modify the updated date #16

Open
wants to merge 3 commits into from
View
117 filters/coercedates.plugin
@@ -0,0 +1,117 @@
+# If you don't want items to "move up" on your planet if the source feed
+# updates them (and changes the update date to something newer than was
+# originally used) you may be tempted to use the "ignore_in_feed: updated"
+# option, but there are three important things to realize about doing this:
+#
+# * When you ignore the "updated" date, it will default to the
+# "published" date -- but if there is no "published" date (very common
+# in many RSS feeds) it will default to the current date+time.
+#
+# * If you purge the entire cache (perhaps because you added a filter)
+# all of the "updated" dates for those items w/o a "published" date will
+# be re-set to the current date+time
+#
+# * The "updated" date is what Venus uses to sort the list
+#
+# This may all seem obvious, but can be highly annoying when you deal
+# with some feeds that have no "published" date and have to occasionally
+# purge your cache.
+#
+# One solution would be to only use "ignore_in_feed: updated" on the feeds
+# where you know the feed contains a "published" date for each item, and
+# don't use it for feeds that only contain an "updated" date for each item
+# -- but that can be tedious.
+#
+# So use this plugin instead
+#
+# This plugin will replace the "updated" and "published" dates of every item
+# with whichever of the two values is the lowest, unless the item is already
+# in the cache, in which case it will use the "updated" date from the item in
+# the cache -- making it a safe alternative to "ignore_in_feed: updated" for
+# all feeds regardless of whether the items have a "published" date or not,
+# and regardless of whether the ones that do have a "published" date try to
+# modify it or not.
+#
+###########################################################################
+
+import sys, time, os
+from xml.dom import minidom
+import planet
+from planet import reconstitute
+from planet import config
+from planet.reconstitute import date
+from planet.spider import filename
+
+log = planet.logger
+
+# finds the first descendent element that matches the specified
+# namespace and tag name, parses it (in canonical date format),
+# returns the parsed value, and removes (all of the) element(s)
+def parseAndPurgeDateElement(element, ns, tagName):
+ result = None
+ # see if we have any date(s?)
+ kids = element.getElementsByTagNameNS(ns, tagName)
+ if kids:
+ # record the first one
+ result = time.strptime(kids[0].childNodes[0].nodeValue,
+ '%Y-%m-%dT%H:%M:%SZ')
+ # get rid of all of them
+ for trash in kids:
+ trash.parentNode.removeChild(trash)
+ return result
+
+
+# given the identifier of an entry in the cache, fetches the
+# formated mtime of that entry (which should match the updated
+# date if venus has done it's job right
+#
+# returns None if the entry is not in the cache
+def getDateFromCache(entry):
+ if entry is None:
+ log.error("Attempted to lookup the date of 'None'")
+ return None
+
+ id = entry.getElementsByTagNameNS(atomNS, 'id')[0].childNodes[0].nodeValue
+ if id is None:
+ log.error("Unable to find id in entry")
+ return None
+
+ cache = os.path.join(config.cache_directory())
+ file = filename(cache, id)
+ if os.path.exists(file):
+ return time.gmtime(os.stat(file).st_mtime)
+ return None
+
+
+atomNS = 'http://www.w3.org/2005/Atom'
+planetNS = 'http://planet.intertwingly.net/'
+
+# parse input stream
+dom = minidom.parse(sys.stdin)
+
+entries = dom.getElementsByTagNameNS(atomNS, 'entry')
+for e in entries:
+
+ # get & remove our dates from the entry
+ updatedDate = parseAndPurgeDateElement(e, atomNS, 'updated')
+ pubDate = parseAndPurgeDateElement(e, atomNS, 'published')
+
+ cacheDate = getDateFromCache(e)
+
+ if cacheDate is not None:
+ mainDate = cacheDate
+ elif not updatedDate:
+ mainDate = pubDate
+ elif not pubDate:
+ mainDate = updatedDate
+ elif pubDate < updatedDate:
+ mainDate = pubDate
+ else:
+ mainDate = updatedDate
+
+ # add back to the entry
+ reconstitute.date(e, 'published', mainDate)
+ reconstitute.date(e, 'updated', mainDate)
+
+# output the dom
+print dom.toxml('utf-8')
View
9 planet/spider.py
@@ -235,6 +235,15 @@ def writeCache(feed_uri, feed_info, data):
if os.path.exists(cache_file): os.remove(cache_file)
continue
+ # re-set mtime incase filters have modified it
+ try:
+ edoc = feedparser.parse(output)
+ mtime = calendar.timegm(edoc.entries[0].updated_parsed)
+ except:
+ log.warning("Unable to re-set mtime on %s after running filters: ",
+ entry.id,
+ sys.exc_info()[0])
+
# write out and timestamp the results
write(output, cache_file, mtime)
View
5 planet/vendor/feedparser.py
@@ -1982,6 +1982,7 @@ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0
sValue = bNormalize and self.normalize(sValue) or sValue.strip()
if (not sValue) and (iPropertyType == self.URI):
if sNodeName == 'a': sValue = elmResult.get('href')
+ elif sNodeName == 'iframe': sValue = elmResult.get('src')
elif sNodeName == 'img': sValue = elmResult.get('src')
elif sNodeName == 'object': sValue = elmResult.get('data')
if sValue:
@@ -2339,7 +2340,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
- 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
+ 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
@@ -2355,7 +2356,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
- 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+ 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
View
4 planet/vendor/html5lib/sanitizer.py
@@ -13,7 +13,7 @@ class HTMLSanitizerMixin(object):
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4',
- 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd',
+ 'h5', 'h6', 'hr', 'i', 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd',
'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol',
'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre',
'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound',
@@ -43,7 +43,7 @@ class HTMLSanitizerMixin(object):
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
- 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
+ 'face', 'for', 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
View
42 tests/data/filter/coercedates/a-rss-1.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<channel>
+<title>Fake RSS Blog</title>
+<link>http://fake.url.example.com</link>
+<description>Fake RSS Feed For testing</description>
+<image>
+<url>http://fake.url.example.com/feedlogo.gif</url>
+<title>Test RSS Feed</title>
+<link>http://fake.url.example.com</link>
+</image>
+<language>en-us</language>
+<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
+<generator>Blogsmith http://www.blogsmith.com/</generator>
+
+<item>
+ <title>Fake Title: RSS Has No Date</title>
+ <link>http://fake.url.example.com/rss-no-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
+ <comments>http://fake.url.example.com/rss-no-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+</item>
+
+
+<item>
+ <title>Fake Title: RSS Has Changing Date</title>
+ <link>http://fake.url.example.com/rss-changing-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
+ <comments>http://fake.url.example.com/rss-changing-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+ <dc:date>2011-12-01T11:00:00+00:00</dc:date>
+</item>
+
+</channel></rss>
View
42 tests/data/filter/coercedates/a-rss-2.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<channel>
+<title>Fake RSS Blog</title>
+<link>http://fake.url.example.com</link>
+<description>Fake RSS Feed For testing</description>
+<image>
+<url>http://fake.url.example.com/feedlogo.gif</url>
+<title>Test RSS Feed</title>
+<link>http://fake.url.example.com</link>
+</image>
+<language>en-us</language>
+<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
+<generator>Blogsmith http://www.blogsmith.com/</generator>
+
+<item>
+ <title>Fake Title: RSS Has No Date</title>
+ <link>http://fake.url.example.com/rss-no-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
+ <comments>http://fake.url.example.com/rss-no-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+</item>
+
+
+<item>
+ <title>Fake Title: RSS Has Changing Date</title>
+ <link>http://fake.url.example.com/rss-changing-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
+ <comments>http://fake.url.example.com/rss-changing-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+ <dc:date>2011-12-07T11:07:07+00:00</dc:date>
+</item>
+
+</channel></rss>
View
92 tests/data/filter/coercedates/b-atom-1.xml
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<feed
+ xmlns="http://www.w3.org/2005/Atom"
+ xmlns:thr="http://purl.org/syndication/thread/1.0"
+ xml:lang="en"
+ xml:base="http://fake.url.example.com/wp-atom.php"
+ >
+ <title type="text">Fake Atom Feed</title>
+ <subtitle type="text">Fake Atom feed for testing stuff</subtitle>
+
+ <updated>2011-12-08T00:00:28Z</updated>
+
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com" />
+ <id>http://fake.url.example.com/feed/atom/</id>
+ <link rel="self" type="application/atom+xml" href="http://fake.url.example.com/feed/atom/" />
+
+ <generator uri="http://wordpress.org/" version="3.2.1">WordPress</generator>
+
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Updated Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-updated" />
+ <id>http://fake.url.example.com/atom-changing-updated</id>
+ <updated>2011-12-05T10:06:38Z</updated>
+ <published>2011-11-09T00:00:28Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-updated"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-updated#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-updated/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Published Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-published" />
+ <id>http://fake.url.example.com/atom-changing-published</id>
+ <published>2011-12-08T02:02:28Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-published"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-published#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-published/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom No Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-no-date" />
+ <id>http://fake.url.example.com/atom-no-date</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-no-date"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-no-date#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-no-date/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Update Before Published]]></title>
+ <updated>2011-11-11T11:11:11Z</updated>
+ <published>2011-12-12T12:12:12Z</published>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-update-before-pub" />
+ <id>http://fake.url.example.com/atom-update-before-pub</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-update-before-pub"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-update-before-pub#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-update-before-pub/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+</feed>
View
92 tests/data/filter/coercedates/b-atom-2.xml
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<feed
+ xmlns="http://www.w3.org/2005/Atom"
+ xmlns:thr="http://purl.org/syndication/thread/1.0"
+ xml:lang="en"
+ xml:base="http://fake.url.example.com/wp-atom.php"
+ >
+ <title type="text">Fake Atom Feed</title>
+ <subtitle type="text">Fake Atom feed for testing stuff</subtitle>
+
+ <updated>2011-12-08T00:00:28Z</updated>
+
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com" />
+ <id>http://fake.url.example.com/feed/atom/</id>
+ <link rel="self" type="application/atom+xml" href="http://fake.url.example.com/feed/atom/" />
+
+ <generator uri="http://wordpress.org/" version="3.2.1">WordPress</generator>
+
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Updated Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-updated" />
+ <id>http://fake.url.example.com/atom-changing-updated</id>
+ <updated>2011-12-07T07:07:37Z</updated>
+ <published>2011-11-09T00:00:28Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-updated"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-updated#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-updated/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Published Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-published" />
+ <id>http://fake.url.example.com/atom-changing-published</id>
+ <published>2011-12-13T13:13:13Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-published"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-published#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-published/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom No Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-no-date" />
+ <id>http://fake.url.example.com/atom-no-date</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-no-date"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-no-date#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-no-date/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Update Before Published]]></title>
+ <updated>2009-09-09T09:09:09Z</updated>
+ <published>2011-12-12T12:12:12Z</published>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-update-before-pub" />
+ <id>http://fake.url.example.com/atom-update-before-pub</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-update-before-pub"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-update-before-pub#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-update-before-pub/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+</feed>
View
7 tests/data/filter/coercedates/config.ini
@@ -0,0 +1,7 @@
+[Planet]
+name = test planet
+cache_directory = tests/work/coercedates/cache
+cache_blasklist_directory = tests/work/coercedates/cache/blacklist
+
+filters: coercedates.plugin
+
View
106 tests/test_filter_coercedates.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import unittest, os, glob, calendar, shutil, time
+from planet.spider import filename, spiderPlanet, writeCache
+from planet import feedparser, config
+import planet
+
+workdir = 'tests/work/coercedates/cache'
+testfeed = 'tests/data/filter/coercedates/%s.xml'
+configfile = 'tests/data/filter/coercedates/config.ini'
+
+class CoerceDatesTest(unittest.TestCase):
+ def setUp(self):
+ # silence errors
+ self.original_logger = planet.logger
+ # planet.getLogger('CRITICAL',None)
+
+ try:
+ os.makedirs(workdir)
+ except:
+ self.tearDown()
+ os.makedirs(workdir)
+
+ def tearDown(self):
+ shutil.rmtree(workdir)
+ os.removedirs(os.path.split(workdir)[0])
+ planet.logger = self.original_logger
+
+ def spiderFeed(self, feed_uri):
+ feed_info = feedparser.parse('<feed/>')
+ data = feedparser.parse(feed_uri)
+ writeCache(feed_uri, feed_info, data)
+
+ # no expected_date means we don't know what it should be yet
+ def verify_date(self, id, expected_date = None):
+
+ file = os.path.join(workdir, id)
+
+ # verify that the file exists
+ self.assertTrue(os.path.exists(file), msg=file);
+
+ data = feedparser.parse(file)
+
+ # verify published & updated dates are in sync and match expected
+
+ self.assertEqual(data.entries[0].updated,
+ data.entries[0].published)
+
+ # verify mtime is in sync
+ self.assertEqual(time.gmtime(os.stat(file).st_mtime),
+ data.entries[0].updated_parsed)
+ self.assertEqual(time.gmtime(os.stat(file).st_mtime),
+ data.entries[0].published_parsed)
+
+ # verify meet hardcoded expectations
+ if expected_date is not None:
+ self.assertEqual(expected_date,
+ data.entries[0].updated)
+
+ return data.entries[0].updated
+
+ def test_coerce_rss(self):
+ config.load(configfile)
+
+ # load first version of RSS
+ self.spiderFeed(testfeed % 'a-rss-1')
+
+ rss_no_date_expected = self.verify_date('fake.url.example.com,rss-no-date')
+ self.verify_date('fake.url.example.com,rss-changing-date',
+ u'2011-12-01T11:00:00Z')
+
+ # parse updated RSS feed
+ self.spiderFeed(testfeed % 'a-rss-2')
+
+ # verify dates haven't changed
+ self.verify_date('fake.url.example.com,rss-no-date',
+ rss_no_date_expected)
+ self.verify_date('fake.url.example.com,rss-changing-date',
+ u'2011-12-01T11:00:00Z')
+
+ def test_coerce_atom(self):
+ config.load(configfile)
+
+ # load first version of Atom
+ self.spiderFeed(testfeed % 'b-atom-1')
+
+ atom_no_date_expected = self.verify_date('fake.url.example.com,atom-no-date')
+ self.verify_date('fake.url.example.com,atom-changing-published',
+ u'2011-12-08T02:02:28Z')
+ self.verify_date('fake.url.example.com,atom-changing-updated',
+ u'2011-11-09T00:00:28Z')
+ self.verify_date('fake.url.example.com,atom-update-before-pub',
+ u'2011-11-11T11:11:11Z')
+
+ # parse updated Atom feed
+ self.spiderFeed(testfeed % 'b-atom-2')
+
+ # verify dates haven't changed
+ self.verify_date('fake.url.example.com,atom-no-date',
+ atom_no_date_expected)
+ self.verify_date('fake.url.example.com,atom-changing-published',
+ u'2011-12-08T02:02:28Z')
+ self.verify_date('fake.url.example.com,atom-changing-updated',
+ u'2011-11-09T00:00:28Z')
+ self.verify_date('fake.url.example.com,atom-update-before-pub',
+ u'2011-11-11T11:11:11Z')