Sync with trunk.

2 parents cfce4dd + 236b3c2 commit da56f78d70e4db118ccff8902435dc81103d7719 @distler committed Nov 11, 2007
Showing with 3,759 additions and 1,133 deletions.
  1. +1 −1 README
  2. +4 −3 docs/config.html
  3. +2 −2 docs/filters.html
  4. +2 −2 docs/normalization.html
  5. +6 −1 examples/opml-top100.ini
  6. +10 −4 filters/excerpt.py
  7. +1 −1 filters/html2xhtml.plugin
  8. +14 −9 filters/mememe.plugin
  9. +36 −0 filters/minhead.py
  10. +24 −0 filters/xhtml2html.plugin
  11. +0 −5 filters/xhtml2html.py
  12. +8 −4 planet/__init__.py
  13. +22 −14 planet/config.py
  14. +29 −0 planet/csv_config.py
  15. +1 −2 planet/expunge.py
  16. +4 −3 planet/foaf.py
  17. +0 −42 planet/html5lib/treebuilders/__init__.py
  18. +0 −5 planet/html5lib/treebuilders/etree.py
  19. +0 −227 planet/html5lib/treebuilders/etreefull.py
  20. +3 −1 planet/idindex.py
  21. +12 −7 planet/reconstitute.py
  22. +1 −1 planet/shell/__init__.py
  23. +3 −1 planet/shell/dj.py
  24. +12 −5 planet/shell/tmpl.py
  25. +1 −0 planet/shell/xslt.py
  26. +59 −11 planet/spider.py
  27. +2 −2 planet/splice.py
  28. 0 planet/{ → vendor}/compat_logging/__init__.py
  29. 0 planet/{ → vendor}/compat_logging/config.py
  30. 0 planet/{ → vendor}/compat_logging/handlers.py
  31. +34 −10 planet/{ → vendor}/feedparser.py
  32. 0 planet/{ → vendor}/html5lib/__init__.py
  33. +301 −162 planet/{ → vendor}/html5lib/constants.py
  34. 0 planet/vendor/html5lib/filters/__init__.py
  35. +10 −0 planet/vendor/html5lib/filters/_base.py
  36. +63 −0 planet/vendor/html5lib/filters/inject_meta_charset.py
  37. +88 −0 planet/vendor/html5lib/filters/lint.py
  38. +175 −0 planet/vendor/html5lib/filters/optionaltags.py
  39. +41 −0 planet/vendor/html5lib/filters/whitespace.py
  40. +320 −201 planet/{ → vendor}/html5lib/html5parser.py
  41. +193 −79 planet/{ → vendor}/html5lib/inputstream.py
  42. +51 −25 planet/{ → vendor}/html5lib/liberalxmlparser.py
  43. +202 −0 planet/vendor/html5lib/sanitizer.py
  44. +3 −0 planet/vendor/html5lib/serializer/__init__.py
  45. +218 −0 planet/vendor/html5lib/serializer/htmlserializer.py
  46. +9 −0 planet/vendor/html5lib/serializer/xhtmlserializer.py
  47. +391 −179 planet/{ → vendor}/html5lib/tokenizer.py
  48. +65 −0 planet/vendor/html5lib/treebuilders/__init__.py
  49. +7 −3 planet/{ → vendor}/html5lib/treebuilders/_base.py
  50. +16 −43 planet/{ → vendor}/html5lib/treebuilders/dom.py
  51. +266 −0 planet/vendor/html5lib/treebuilders/etree.py
  52. +13 −36 planet/{ → vendor}/html5lib/treebuilders/simpletree.py
  53. +158 −0 planet/vendor/html5lib/treebuilders/soup.py
  54. +52 −0 planet/vendor/html5lib/treewalkers/__init__.py
  55. +154 −0 planet/vendor/html5lib/treewalkers/_base.py
  56. +37 −0 planet/vendor/html5lib/treewalkers/dom.py
  57. +112 −0 planet/vendor/html5lib/treewalkers/etree.py
  58. +67 −0 planet/vendor/html5lib/treewalkers/genshistream.py
  59. +52 −0 planet/vendor/html5lib/treewalkers/pulldom.py
  60. +72 −0 planet/vendor/html5lib/treewalkers/simpletree.py
  61. +36 −0 planet/vendor/html5lib/treewalkers/soup.py
  62. 0 planet/{ → vendor}/html5lib/utils.py
  63. 0 planet/{ → vendor}/htmltmpl.py
  64. 0 planet/{ → vendor}/httplib2/__init__.py
  65. 0 planet/{ → vendor}/httplib2/iri2uri.py
  66. 0 planet/{ → vendor}/portalocker.py
  67. 0 planet/{ → vendor}/timeoutsocket.py
  68. +3 −3 tests/data/apply/config-html.ini
  69. +3 −0 tests/data/config/basic.csv
  70. +7 −0 tests/data/config/rlist-config.ini
  71. +7 −0 tests/data/config/rlist-csv.ini
  72. +6 −0 tests/data/config/subconfig.ini
  73. +1 −1 tests/data/filter/django/test.xml
  74. +1 −0 tests/data/filter/excerpt-lorem-ipsum.ini
  75. +3 −0 tests/data/filter/minhead.ini
  76. +3 −0 tests/data/filter/minhead.xml
  77. +13 −0 tests/data/filter/tmpl/content_xhtml2.xml
  78. +13 −0 tests/data/reconstitute/author_noname.xml
  79. +11 −0 tests/data/reconstitute/category_blank_term.xml
  80. +1 −1 tests/data/reconstitute/category_label.xml
  81. +1 −1 tests/data/reconstitute/category_scheme.xml
  82. +10 −0 tests/data/reconstitute/empty_title.xml
  83. +9 −0 tests/data/reconstitute/missing_title.xml
  84. +41 −0 tests/data/spider/testfeed4.atom
  85. +3 −3 tests/reconstitute.py
  86. +14 −10 tests/test_apply.py
  87. +25 −0 tests/test_config_csv.py
  88. +2 −1 tests/test_expunge.py
  89. +2 −2 tests/test_filter_django.py
  90. +0 −7 tests/test_filter_genshi.py
  91. +22 −0 tests/test_filters.py
  92. +5 −5 tests/test_idindex.py
  93. +9 −1 tests/test_scrub.py
  94. +10 −1 tests/test_spider.py
  95. +8 −0 tests/test_subconfig.py
  96. +4 −0 themes/asf/default.css
  97. +43 −5 themes/asf/index.html.xslt
  98. +16 −1 themes/common/opml.xml.xslt
README
@@ -9,7 +9,7 @@ also actively being maintained.
It uses Mark Pilgrim's Universal Feed Parser to read from CDF, RDF, RSS and
Atom feeds; Leonard Richardson's Beautiful Soup to correct markup issues;
-and either Tomas Styblo's templating engine Daniel Viellard's implementation
+and either Tomas Styblo's templating engine or Daniel Viellard's implementation
of XSLT to output static files in any format you can dream up.
To get started, check out the documentation in the docs directory. If you have
docs/config.html
@@ -139,9 +139,10 @@ <h3 id="subscription"><code>[</code><em>subscription</em><code>]</code></h3>
the predefined themes presume that at least <code>name</code> is defined.</p>
<p>The <code>content_type</code> parameter can be defined to indicate that
this subscription is a <em>reading list</em>, i.e., is an external list
-of subscriptions. At the moment, two formats of reading lists are supported:
-<code>opml</code> and <code>foaf</code>. In the future, support for formats
-like <code>xoxo</code> could be added.</p>
+of subscriptions. At the moment, three formats of reading lists are supported:
+<code>opml</code>, <code>foaf</code>, <code>csv</code>, and
+<code>config</code>. In the future,
+support for formats like <code>xoxo</code> could be added.</p>
<p><a href="normalization.html#overrides">Normalization overrides</a> can
also be defined here.</p>
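For example, a subscription section that points at an external reading list rather than a feed could look like this (URL hypothetical, following the content_type mechanism described above):

    [http://example.com/blogroll.opml]
    content_type = opml

Venus downloads the list, converts it to configuration, and caches the result.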
docs/filters.html
@@ -84,8 +84,8 @@
through the specified filter and the output is planced into the named file; the
other unmodified branch continues onto the next filter, if any.
One use case for this function is to use
-<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
-an HTML output stream from one source.</li>
+<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
+and an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to
docs/normalization.html
@@ -80,8 +80,8 @@ <h3 id="overrides">Overrides</h3>
<ul>
<li><code>ignore_in_feed</code> allows you to list any number of elements
or attributes which are to be ignored in feeds. This is often handy in the
-case of feeds where the <code>id</code>, <code>updated</code> or
-<code>xml:lang</code> values can't be trusted.</li>
+case of feeds where the <code>author</code>, <code>id</code>,
+<code>updated</code> or <code>xml:lang</code> values can't be trusted.</li>
<li><code>title_type</code>, <code>summary_type</code>,
<code>content_type</code> allow you to override the
<a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>
examples/opml-top100.ini
@@ -31,13 +31,18 @@ activity_threshold = 90
# filters to be run
filters = excerpt.py
+bill_of_materials:
+ .htaccess
+ favicon.ico
+ robots.txt
+
# filter parameters
[excerpt.py]
omit = img p br
width = 500
# add memes to output
-[index.html.tmpl]
+[index.html.xslt]
filters = mememe.plugin
[mememe.plugin]
filters/excerpt.py
@@ -4,6 +4,7 @@
Parameters:
width: maximum number of characters in the excerpt. Default: 500
omit: whitespace delimited list of html tags to remove. Default: none
+ target: name of element created. Default: planet:excerpt
Notes:
* if 'img' is in the list of tags to be omitted <img> tags are replaced with
@@ -23,6 +24,7 @@
wrapper = textwrap.TextWrapper(width=int(args.get('width','500')))
omit = args.get('omit', '').split()
+target = args.get('target', 'planet:excerpt')
class copy:
""" recursively copy a source to a target, up to a given width """
@@ -94,10 +96,14 @@ def copyText(self, source, target):
# if present, recursively copy it to a planet:excerpt element
if source:
- dom.documentElement.setAttribute('xmlns:planet', planetNS)
- target = dom.createElementNS(planetNS, 'planet:excerpt')
- source[0].parentNode.appendChild(target)
- copy(dom, source[0], target)
+ if target.startswith('planet:'):
+ dom.documentElement.setAttribute('xmlns:planet', planetNS)
+ if target.startswith('atom:'): target = target.split(':',1)[1]
+ excerpt = dom.createElementNS(planetNS, target)
+ source[0].parentNode.appendChild(excerpt)
+ copy(dom, source[0], excerpt)
+ if source[0].nodeName == excerpt.nodeName:
+ source[0].parentNode.removeChild(source[0])
# print out results
print dom.toxml('utf-8')
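With the new target parameter, the excerpt no longer has to be emitted as planet:excerpt. A sketch of the corresponding filter configuration (values assumed):

    [excerpt.py]
    width = 500
    omit = img p br
    target = atom:summary

Per the code above, the atom: prefix is stripped before the element is created, and an existing element with the same node name is removed in favor of the excerpt.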
filters/html2xhtml.plugin
@@ -1,5 +1,5 @@
import sys
-from planet import html5lib
+import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)
filters/mememe.plugin
@@ -23,9 +23,10 @@ from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
-from planet import config, feedparser
+from planet import config
from planet.spider import filename
-log = planet.getLogger(config.log_level(),config.log_format())
+import feedparser
+log = planet.logger
options = config.filter_options(sys.argv[0])
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
@@ -64,6 +65,7 @@ def cache_meme(url, headers):
file.close()
urlmap = {}
+revmap = {}
def canonicalize(url):
url = urlmap.get(url,url)
parts = list(urlparse.urlparse(url))
@@ -73,7 +75,10 @@ def canonicalize(url):
if parts[1].startswith('www.'): parts[1]=parts[1][4:]
if not parts[2]: parts[2] = '/'
parts[-1] = ''
- return urlparse.urlunparse(parts)
+
+ canonurl = urlparse.urlunparse(parts)
+ revmap[canonurl] = url
+ return canonurl
log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
@@ -125,7 +130,7 @@ for name in glob.glob(os.path.join(cache, '*')):
# identify the unique links
entry_links = []
- for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
+ for node in doc.xpathEval("//*[@href and not(@rel='source') and not(@rel='license')]"):
parent = node.parent
while parent:
if parent.name == 'source': break
@@ -309,7 +314,7 @@ meme_feed.newTextChild(None, 'updated',
# parse the input
log.debug("Parse input")
-doc=libxml2.parseDoc(sys.stdin.read())
+doc=libxml2.readDoc(sys.stdin.read(), '', 'utf-8', libxml2.XML_PARSE_NONET)
# find the sidebar/footer
sidebar = options.get('sidebar','//*[@class="sidebar"]')
@@ -340,7 +345,7 @@ while child:
if not title: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a', title)
- a.setProp('href', entry)
+ a.setProp('href', revmap.get(entry,entry))
link_count = link_count + 1
if link_count >= 10: break
if link_count > 0: state = None
@@ -388,7 +393,7 @@ for i in range(0,len(weighted_links)):
# otherwise, parse the html
if not title:
- title = html(link).title
+ title = html(revmap.get(link,link)).title
# dehtmlize
title = re.sub('&(\w+);',
@@ -421,7 +426,7 @@ for i in range(0,len(weighted_links)):
# main link
a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
- a.setProp('href',link)
+ a.setProp('href',revmap.get(link,link))
if (((i==0) or (updated>=weighted_links[i-1][2])) and
(i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
rank = 0
@@ -437,7 +442,7 @@ for i in range(0,len(weighted_links)):
if entry in voters: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a' , author)
- a.setProp('href',entry)
+ a.setProp('href',revmap.get(entry,entry))
if title: a.setProp('title',title)
voters.append(entry)
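The paired urlmap/revmap change separates counting from display: votes are tallied against the canonical form of each URL, while every href written into the output is passed back through revmap so readers get the original link. A minimal standalone illustration of the pattern (hypothetical URL, simplified from the plugin's canonicalize):

    import urlparse

    revmap = {}

    def canonicalize(url):
        # normalize for vote counting; remember the original for display
        parts = list(urlparse.urlparse(url))
        if parts[1].startswith('www.'): parts[1] = parts[1][4:]
        if not parts[2]: parts[2] = '/'
        parts[-1] = ''                            # drop the fragment
        canonurl = urlparse.urlunparse(parts)
        revmap[canonurl] = url
        return canonurl

    canon = canonicalize('http://www.example.com/post#comments')
    print canon                     # http://example.com/post
    print revmap.get(canon, canon)  # http://www.example.com/post#comments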
filters/minhead.py
@@ -0,0 +1,36 @@
+#
+# Ensure that all headings are below a permissible maximum (like h3).
+# If not, all heading levels will be changed to conform.
+# Note: this may create "illegal" heading levels, like h7 and beyond.
+#
+
+import sys
+from xml.dom import minidom, XHTML_NAMESPACE
+
+# determine permissible minimimum heading
+if '--min' in sys.argv:
+ minhead = int(sys.argv[sys.argv.index('--min')+1])
+else:
+ minhead=3
+
+# parse input stream
+doc = minidom.parse(sys.stdin)
+
+# search for headings below the permissable minimum
+first=minhead
+for i in range(1,minhead):
+ if doc.getElementsByTagName('h%d' % i):
+ first=i
+ break
+
+# if found, bump all headings so that the top is the permissible minimum
+if first < minhead:
+ for i in range(6,0,-1):
+ for oldhead in doc.getElementsByTagName('h%d' % i):
+ newhead = doc.createElementNS(XHTML_NAMESPACE, 'h%d' % (i+minhead-first))
+ for child in oldhead.childNodes:
+ newhead.appendChild(child)
+ oldhead.parentNode.replaceChild(newhead, oldhead)
+
+# return (possibly modified) document
+print doc.toxml('utf-8')
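Python filters receive their parameters as command-line arguments, so the new tests/data/filter/minhead.ini fixture presumably wires this up along the following lines (a sketch, not the fixture's literal contents):

    [Planet]
    filter_directories = filters
    filters = minhead.py

    [minhead.py]
    min = 3

With min = 3, an entry whose headings start at h1 has them all bumped two levels, so the top heading becomes h3.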
filters/xhtml2html.plugin
@@ -0,0 +1,24 @@
+# Example usages:
+#
+# filters:
+# xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = {}
+for name,value in zip(sys.argv[1::2],sys.argv[2::2]):
+ name = name.lstrip('-')
+ try: opts[name] = eval(value)
+ except: opts[name] = value
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+ sys.stdout.write(text)
filters/xhtml2html.py
@@ -1,5 +0,0 @@
-import sys
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
planet/__init__.py
@@ -1,6 +1,7 @@
xmlns = 'http://planet.intertwingly.net/'
logger = None
+loggerParms = None
import os, sys, re
import config
@@ -11,8 +12,8 @@
def getLogger(level, format):
""" get a logger with the specified log level """
- global logger
- if logger: return logger
+ global logger, loggerParms
+ if logger and loggerParms == (level,format): return logger
try:
import logging
@@ -21,16 +22,19 @@ def getLogger(level, format):
import compat_logging as logging
logging.basicConfig(format=format)
- logging.getLogger().setLevel(logging.getLevelName(level))
logger = logging.getLogger("planet.runner")
+ logger.setLevel(logging.getLevelName(level))
try:
logger.warning
except:
logger.warning = logger.warn
+ loggerParms = (level,format)
return logger
+sys.path.insert(1, os.path.join(os.path.dirname(__file__),'vendor'))
+
# Configure feed parser
-from planet import feedparser
+import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0
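The loggerParms tuple turns getLogger into a parameter-aware cache: a repeat call with the same level and format returns the existing logger, while a changed level or format causes it to be reconfigured. Roughly:

    import planet

    log = planet.getLogger('DEBUG', '%(message)s')
    assert log is planet.getLogger('DEBUG', '%(message)s')  # cached
    planet.getLogger('INFO', '%(message)s')  # params differ: reconfigured

The sys.path insertion is what lets the libraries newly bundled under planet/vendor be imported by their plain names (import feedparser, import html5lib) throughout the tree.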
planet/config.py
@@ -138,8 +138,10 @@ def load(config_file):
parser.read(config_file)
import config, planet
- from planet import opml, foaf
- log = planet.getLogger(config.log_level(),config.log_format())
+ from planet import opml, foaf, csv_config
+ log = planet.logger
+ if not log:
+ log = planet.getLogger(config.log_level(),config.log_format())
# Theme support
theme = config.output_theme()
@@ -191,18 +193,22 @@ def load(config_file):
os.makedirs(config.cache_lists_directory())
def data2config(data, cached_config):
- if content_type(list).find('opml')>=0:
- opml.opml2config(data, cached_config)
- elif content_type(list).find('foaf')>=0:
- foaf.foaf2config(data, cached_config)
- else:
- from planet import shell
- import StringIO
- cached_config.readfp(StringIO.StringIO(shell.run(
- content_type(list), data.getvalue(), mode="filter")))
+ if content_type(list).find('opml')>=0:
+ opml.opml2config(data, cached_config)
+ elif content_type(list).find('foaf')>=0:
+ foaf.foaf2config(data, cached_config)
+ elif content_type(list).find('csv')>=0:
+ csv_config.csv2config(data, cached_config)
+ elif content_type(list).find('config')>=0:
+ cached_config.readfp(data)
+ else:
+ from planet import shell
+ import StringIO
+ cached_config.readfp(StringIO.StringIO(shell.run(
+ content_type(list), data.getvalue(), mode="filter")))
- if cached_config.sections() in [[], [list]]:
- raise Exception
+ if cached_config.sections() in [[], [list]]:
+ raise Exception
for list in reading_lists:
downloadReadingList(list, parser, data2config)
@@ -344,7 +350,9 @@ def reading_lists():
for section in parser.sections():
if parser.has_option(section, 'content_type'):
type = parser.get(section, 'content_type')
- if type.find('opml')>=0 or type.find('foaf')>=0 or type.find('.')>=0:
+ if type.find('opml')>=0 or type.find('foaf')>=0 or \
+ type.find('csv')>=0 or type.find('config')>=0 or \
+ type.find('.')>=0:
result.append(section)
return result
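The two new branches extend the reading-list dispatch: a csv list is converted by csv2config (next file), while a config list is already in INI form and is read in directly. A hypothetical subscription in the style of the new tests/data/config/rlist-*.ini fixtures:

    [tests/data/config/basic.csv]
    content_type = csv

Anything else whose content_type contains a dot is still treated as the name of a filter expected to emit configuration.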
planet/csv_config.py
@@ -0,0 +1,29 @@
+from ConfigParser import ConfigParser
+import csv
+
+# input = csv, output = ConfigParser
+def csv2config(input, config=None):
+
+ if not hasattr(input, 'read'):
+ input = csv.StringIO(input)
+
+ if not config:
+ config = ConfigParser()
+
+ reader = csv.DictReader(input)
+ for row in reader:
+ section = row[reader.fieldnames[0]]
+ config.add_section(section)
+ for name, value in row.items():
+ if value and name != reader.fieldnames[0]:
+ config.set(section, name, value)
+
+ return config
+
+if __name__ == "__main__":
+ # small main program which converts CSV into config.ini format
+ import sys, urllib
+ config = ConfigParser()
+ for input in sys.argv[1:]:
+ csv2config(urllib.urlopen(input), config)
+ config.write(sys.stdout)
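The first column supplies the section name and the remaining columns become options, with blank values skipped. For instance, a hypothetical feed list in the spirit of the new tests/data/config/basic.csv:

    url,name
    http://example.com/feed.xml,Example Feed

converts to the equivalent of:

    [http://example.com/feed.xml]
    name = Example Feed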