MeMeme and html2xhtml plugins

commit a5e1fde287e83779aca83a9f4511fdd5d43dbbd1 (parent: ddf15fc)
@rubys authored
1  .bzrignore
@@ -1,3 +1,4 @@
*.tmplc
.DS_Store
cache
+*.pluginc
23 docs/filters.html
@@ -8,12 +8,13 @@
<title>Venus Filters</title>
</head>
<body>
-<h2>Filters</h2>
-<p>Filters are simple Unix pipes. Input comes in <code>stdin</code>,
-parameters come from the config file, and output goes to <code>stdout</code>.
-Anything written to <code>stderr</code> is logged as an ERROR message. If no
-<code>stdout</code> is produced, the entry is not written to the cache or
-processed further; in fact, if the entry had previously been written to the cache, it will be removed.</p>
+<h2>Filters and Plugins</h2>
+<p>Filters and plugins are simple Unix pipes. Input comes in
+<code>stdin</code>, parameters come from the config file, and output goes to
+<code>stdout</code>. Anything written to <code>stderr</code> is logged as an
+ERROR message. If no <code>stdout</code> is produced, the entry is not written
+to the cache or processed further; in fact, if the entry had previously been
+written to the cache, it will be removed.</p>
<p>There are two types of filters supported by Venus, input and template.</p>
<p>Input to an input filter is a aggressively
@@ -89,6 +90,16 @@
<li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to
everything.</li>
+
+<li>Plugins differ from filters in that while filters are forked, plugins are
+<a href="http://docs.python.org/lib/module-imp.html">imported</a>. This
+means that plugins are limited to Python and are run in-process. Plugins
+therefore have direct access to planet internals like configuration and
+logging facilities, as well as access to the bundled libraries like the
+<a href="http://feedparser.org/docs/">Universal Feed Parser</a> and
+<a href="http://code.google.com/p/html5lib/">html5lib</a>; but it also
+means that functions like <code>os.abort()</code> can't be recovered
+from.</li>
</ul>
</body>
</html>
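
The filter contract documented above is small enough to show end to end. The sketch below is hypothetical and not part of this commit: a pass-through filter that suppresses entries containing a blacklisted word, relying on the no-stdout rule to purge them from the cache.

    import sys

    # a minimal Venus filter: the entry arrives on stdin
    data = sys.stdin.read()

    if 'advertisement' not in data:
        sys.stdout.write(data)    # pass the entry through unchanged
    # writing nothing to stdout tells Venus to drop (and un-cache) the entry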
2  docs/index.html
@@ -21,7 +21,7 @@
<ul>
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
-<li><a href="filters.html">Filters</a></li>
+<li><a href="filters.html">Filters and Plugins</a></li>
</ul>
</li>
<li>Other
7 examples/opml-top100.ini
@@ -36,6 +36,13 @@ filters = excerpt.py
omit = img p br
width = 500
+# add memes to output
+[index.html.tmpl]
+filters = mememe.plugin
+
+[mememe.plugin]
+sidebar = //*[@id="footer"]
+
# subscription list
[http://share.opml.org/opml/top100.opml]
content_type = opml
6 filters/html2xhtml.plugin
@@ -0,0 +1,6 @@
+import sys
+from planet import html5lib
+
+# parse whatever the template produced (tag soup and all) with html5lib,
+# then serialize the resulting DOM back out as well-formed XML
+tree = html5lib.treebuilders.dom.TreeBuilder
+parser = html5lib.html5parser.HTMLParser(tree=tree)
+document = parser.parse(sys.stdin)
+sys.stdout.write(document.toxml("utf-8"))
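
Because the plugin reads sys.stdin and writes sys.stdout, it can be exercised outside the pipeline. A rough sketch, assuming it is run from a checkout root so that `from planet import html5lib` resolves:

    import sys
    from StringIO import StringIO

    sys.stdin = StringIO("<p>unclosed <b>tags")
    execfile('filters/html2xhtml.plugin')
    # emits (roughly) <html><head/><body><p>unclosed <b>tags</b></p></body></html>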
475 filters/mememe.plugin
@@ -0,0 +1,475 @@
+#
+# This Venus output filter will annotate an XHTML page with a list of
+# "memes" (or most popular linked destinations, based on the last week
+# of entries from the cache) and will update the subscription list with
+# links to recent entries from each subscription.
+#
+# Templates that don't produce XHTML natively will need their output passed
+# through html2xhtml.plugin first.
+#
+# Typical configuration (based on classic_fancy):
+#
+# [index.html.tmpl]
+# filters:
+# html2xhtml.plugin
+# mememe.plugin
+#
+# [mememe.plugin]
+# sidebar = @class='sidebar'
+#
+
+import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
+from xml.sax.saxutils import escape
+from htmlentitydefs import entitydefs
+
+import planet
+from planet import config, feedparser
+from planet.spider import filename
+log = planet.getLogger(config.log_level(),config.log_format())
+options = config.filter_options(sys.argv[0])
+
+MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
+
+now = time.time()
+week = 7 * 86400
+week_ago = now - week
+
+cache = config.cache_directory()
+meme_cache = os.path.join(cache, 'memes')
+if not os.path.exists(meme_cache): os.makedirs(meme_cache)
+
+all_links = {}
+feed_links = {}
+
+def check_cache(url):
+ try:
+ file = open(filename(meme_cache, url))
+ headers = eval(file.read())
+ file.close()
+ return headers or {}
+ except:
+ return {}
+
+def cache_meme(url, headers):
+ json = []
+ for key,value in headers.items():
+ json.append(' %s: %s' % (toj(key), toj(value)))
+ file = open(filename(meme_cache, url),'w')
+ file.write('{\n' + ',\n'.join(json) + '\n}\n')
+ file.close()
+
+urlmap = {}
+def canonicalize(url):
+ url = urlmap.get(url,url)
+ parts = list(urlparse.urlparse(url))
+
+ parts[0] = parts[0].lower()
+ parts[1] = parts[1].lower()
+ if parts[1].startswith('www.'): parts[1]=parts[1][4:]
+ if not parts[2]: parts[2] = '/'
+ parts[-1] = ''
+ return urlparse.urlunparse(parts)
+
+log.debug("Loading cached data")
+for name in glob.glob(os.path.join(cache, '*')):
+ # ensure that this is within the past week
+ if os.path.isdir(name): continue
+ mtime = os.stat(name).st_mtime
+ if mtime < week_ago: continue
+
+ # parse the file
+ try:
+ doc = libxml2.parseFile(name)
+ except:
+ continue
+ xp = doc.xpathNewContext()
+ xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
+ xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
+
+ # determine the entry
+ entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
+ if not entry: continue
+ entry = canonicalize(entry[0].prop("href"))
+
+ # determine the title
+ title = xp.xpathEval("/atom:entry/atom:title")
+ if title:
+ if title[0].prop('type') == 'html':
+ title = re.sub('<.*?>','',title[0].content)
+ else:
+ title = title[0].content
+ title = str(title or '')
+
+ # determine the feed id
+ feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
+ if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
+ if not feed: continue
+ feed = feed[0].content
+
+ # determine the author
+ author = xp.xpathEval("/atom:entry/atom:source/planet:name")
+ if author:
+ author = author[0].content
+ else:
+ author = ''
+
+ # track the feed_links
+ if author:
+ if not feed_links.has_key(author): feed_links[author] = list()
+ feed_links[author].append([mtime, entry, title])
+
+ # identify the unique links
+ entry_links = []
+ for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
+ parent = node.parent
+ while parent:
+ if parent.name == 'source': break
+ parent = parent.parent
+ else:
+ link = canonicalize(node.prop('href'))
+ if not link in entry_links:
+ entry_links.append(link)
+ if node.hasProp('title') and node.prop('title').startswith('http'):
+ link = canonicalize(node.prop('title'))
+ if not link in entry_links:
+ entry_links.append(link)
+
+ # add the votes
+ weight = 1.0 - (now - mtime)**2 / week**2
+ vote = [(weight, str(entry), str(feed), title, author, mtime)]
+ for link in entry_links:
+ all_links[link] = all_links.get(link,list()) + vote
+
+ # free the entry
+ doc.freeDoc()
+
+# tally the votes
+weighted_links = []
+for link, votes in all_links.items():
+ site = {}
+ updated = 0
+ for weight, entry, feed, title, author, mtime in votes:
+ site[feed] = max(site.get(feed,0), weight)
+ if mtime > updated: updated=mtime
+ weighted_links.append((sum(site.values()), link, updated))
+weighted_links.sort()
+weighted_links.reverse()
+
+cp1252 = {
+ 128: 8364, # euro sign
+ 130: 8218, # single low-9 quotation mark
+ 131: 402, # latin small letter f with hook
+ 132: 8222, # double low-9 quotation mark
+ 133: 8230, # horizontal ellipsis
+ 134: 8224, # dagger
+ 135: 8225, # double dagger
+ 136: 710, # modifier letter circumflex accent
+ 137: 8240, # per mille sign
+ 138: 352, # latin capital letter s with caron
+ 139: 8249, # single left-pointing angle quotation mark
+ 140: 338, # latin capital ligature oe
+ 142: 381, # latin capital letter z with caron
+ 145: 8216, # left single quotation mark
+ 146: 8217, # right single quotation mark
+ 147: 8220, # left double quotation mark
+ 148: 8221, # right double quotation mark
+ 149: 8226, # bullet
+ 150: 8211, # en dash
+ 151: 8212, # em dash
+ 152: 732, # small tilde
+ 153: 8482, # trade mark sign
+ 154: 353, # latin small letter s with caron
+ 155: 8250, # single right-pointing angle quotation mark
+ 156: 339, # latin small ligature oe
+ 158: 382, # latin small letter z with caron
+ 159: 376} # latin capital letter y with diaeresis
+
+# determine the title for a given url
+class html(sgmllib.SGMLParser):
+ def __init__(self, url):
+ sgmllib.SGMLParser.__init__(self)
+ self.title = ""
+ self.feedurl = ""
+ self.intitle = False
+
+ headers = check_cache(url)
+
+ try:
+ # fetch the page
+ request = urllib2.Request(url)
+ request.add_header('User-Agent', 'Venus/MeMeme')
+ if headers.has_key('etag'):
+ request.add_header('If-None-Match', headers['etag'])
+            if headers.has_key('last-modified'):
+                request.add_header('If-Modified-Since', headers['last-modified'])
+ response = urllib2.urlopen(request)
+ self.feed(response.read())
+
+ # ensure the data is in utf-8
+ try:
+ self.title = self.title.decode('utf-8')
+ except:
+ self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
+ for c in self.title.decode('iso-8859-1')])
+
+ # cache the results
+ headers = {}
+ if self.feedurl: headers['feedurl'] = self.feedurl
+ if self.title: headers['title'] = self.title
+ headers.update(response.headers)
+ cache_meme(url, headers)
+ except:
+ self.feedurl = headers.get('feedurl')
+ if headers.has_key('title'):
+ if isinstance(headers['title'],str):
+ self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
+ else:
+ self.title=headers['title']
+
+ # if there is a feed, look for an entry that matches, and take that title
+ if self.feedurl and not self.title:
+ headers = check_cache(self.feedurl)
+ data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
+ modified=headers.get('last-modified'))
+
+ if data.has_key('headers') and data.has_key('status') and \
+ data.status in [200, 301, 302]:
+
+ titles = {}
+ for entry in data.entries:
+ if entry.has_key('title_detail') and entry.has_key('link'):
+ titles[entry.link] = entry.title_detail.value
+ if entry.title_detail.type == 'text/plain':
+ titles[entry.link] = escape(titles[entry.link])
+
+ if titles.has_key(url): self.title = titles[url]
+
+ data.headers.update(titles)
+ cache_meme(self.feedurl, data.headers)
+ else:
+ if headers.has_key(url):
+ if isinstance(headers[url],str):
+ self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
+ else:
+ self.title=headers[url]
+
+ # fallback is the basename of the URI
+ if not self.title:
+ self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
+
+ # parse out the first autodiscovery link
+ def start_link(self, attrs):
+ if self.feedurl: return
+ attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
+ if not 'rel' in attrs: return
+ rels = attrs['rel'].split(' ')
+ if 'alternate' not in rels: return
+ if not 'type' in attrs or not attrs['type'].endswith('xml'): return
+ if 'href' in attrs:
+ self.feedurl = attrs['href']
+
+ # parse the page title
+ def start_title(self, attributes):
+ if not self.title: self.intitle = True
+ def end_title(self):
+ self.intitle = False
+ def handle_data(self, text):
+ if self.intitle: self.title += escape(text)
+
+# convert unicode string to a json string
+def toj(value):
+ result = repr(value).replace(r'\x',r'\u00')
+ if result[:1] == 'u': result=result[1:]
+ if result.startswith("'"):
+ result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
+ return result
+
+seenit = []
+count = 0
+
+# construct an empty feed
+feed_doc = libxml2.newDoc("1.0")
+meme_feed = feed_doc.newChild(None, "feed", None)
+meme_feed.newNs('http://www.w3.org/2005/Atom', None)
+meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
+author = meme_feed.newChild(None, 'author', None)
+author.newTextChild(None, 'name', config.owner_name())
+if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
+meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
+link = meme_feed.newChild(None, 'link', None)
+link.setProp('href', os.path.join(config.link(), 'memes.atom'))
+link.setProp('rel', 'self')
+meme_feed.newTextChild(None, 'updated',
+ time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
+
+# parse the input
+log.debug("Parse input")
+doc=libxml2.parseDoc(sys.stdin.read())
+
+# find the sidebar/footer
+sidebar = options.get('sidebar','//*[@class="sidebar"]')
+footer = doc.xpathEval(sidebar)
+if not hasattr(footer,'__len__') or len(footer) == 0:
+ raise Exception(sidebar + ' not found')
+if len(footer) > 1:
+ log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar))
+footer = footer[0]
+
+# add up to 10 entry links to each subscription
+subs_ul = footer.children
+while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
+child = subs_ul.children
+while child:
+ if child.name == 'li':
+ if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
+ link = child.lastChild()
+ while link.isText(): link=link.prev
+ author = link.getContent()
+ state = 'inactive'
+ if feed_links.has_key(author):
+ ul2 = child.newChild(None, 'ul', None)
+ feed_links[author].sort()
+ feed_links[author].reverse()
+ link_count = 0
+ for mtime, entry, title in feed_links[author]:
+ if not title: continue
+ li2 = ul2.newChild(None, 'li', None)
+ a = li2.newTextChild(None, 'a', title)
+ a.setProp('href', entry)
+ link_count = link_count + 1
+ if link_count >= 10: break
+ if link_count > 0: state = None
+ if state:
+ link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
+ child=child.next
+
+# create a h2 and ul for the memes list
+footer_top = footer.children
+memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
+memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
+
+# create a header for the memes list
+a = memes.newChild(None, 'a', None)
+a.setProp('href', 'memes.atom')
+img = a.newChild(None, 'img', None)
+img.setProp('src', 'images/feed-icon-10x10.png')
+
+# collect the results
+log.debug("Fetch titles and collect the results")
+from urllib import quote_plus
+for i in range(0,len(weighted_links)):
+ weight, link, updated = weighted_links[i]
+
+    # ensure that somebody new points to this entry. This guards against
+    # groups of related links which several posts all point to.
+ novel = False
+ for weight, entry, feed, title, author, mtime in all_links[link]:
+ if entry not in seenit:
+ seenit.append(entry)
+ novel = True
+ if not novel: continue
+
+ all_links[link].sort()
+ all_links[link].reverse()
+ cache_file = filename(cache, link)
+ title = None
+
+ # when possible, take the title from the cache
+ if os.path.exists(cache_file):
+ entry = feedparser.parse(cache_file).entries[0]
+ if entry.has_key('title_detail'):
+ title = entry.title_detail.value
+ if entry.title_detail.type == 'text/plain': title = escape(title)
+
+ # otherwise, parse the html
+ if not title:
+ title = html(link).title
+
+ # dehtmlize
+ title = re.sub('&(\w+);',
+ lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
+ title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
+ title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
+
+ # title too long? Insert zero width spaces where appropriate
+ if max(map(len,title.split())) > 30:
+ title=re.sub('(\W+)',u'\\1\u200b',title)
+
+ # save the entry title (it is used later)
+ entry_title = title.strip()
+
+ # add to the memes list
+ memes_ul.addContent('\n')
+ li = memes_ul.newChild(None, 'li', None)
+ memes_ul.addContent('\n')
+
+ # technorati link
+ a = li.newChild(None, 'a', None)
+ tlink = 'http://technorati.com/cosmos/search.html?url='
+ if link.startswith('http://'):
+ a.setProp('href',tlink + quote_plus(link[7:]))
+ else:
+ a.setProp('href',tlink + quote_plus(link))
+ a.setProp('title','cosmos')
+ img = a.newChild(None, 'img', None)
+ img.setProp('src','tcosm11.gif')
+
+ # main link
+ a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
+ a.setProp('href',link)
+ if (((i==0) or (updated>=weighted_links[i-1][2])) and
+ (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
+ rank = 0
+ for j in range(0,len(weighted_links)):
+ if updated < weighted_links[j][2]: rank = rank + 1
+ if rank < len(weighted_links)/2:
+ a.setProp('class','rising')
+
+ # voters
+ ul2 = li.newChild(None, 'ul', None)
+ voters = []
+ for weight, entry, feed, title, author, mtime in all_links[link]:
+ if entry in voters: continue
+ li2 = ul2.newChild(None, 'li', None)
+ a = li2.newTextChild(None, 'a' , author)
+ a.setProp('href',entry)
+ if title: a.setProp('title',title)
+ voters.append(entry)
+
+ # add to the meme feed
+ if len(all_links[link]) > 2:
+ meme_feed.addContent('\n')
+ entry = meme_feed.newChild(None, 'entry', None)
+ meme_feed.addContent('\n')
+
+ # entry
+ tagbase = config.link().split('/')
+ if not tagbase[-1]: tagbase = tagbase[:-1]
+ tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
+ entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
+ entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
+ meme_link = entry.newTextChild(None, 'link', None)
+ meme_link.setProp('href', link)
+ entry.newTextChild(None, 'updated',
+ time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))
+
+ # voters
+ content = entry.newChild(None, 'content', None)
+ content.setProp('type', 'xhtml')
+ div = content.newTextChild(None, 'div', 'Spotted by:')
+ div.newNs('http://www.w3.org/1999/xhtml', None)
+ content_ul = div.newChild(None, 'ul', None)
+ for weight, entry, feed, title, author, mtime in all_links[link]:
+ li2 = content_ul.newTextChild(None, 'li', author + ": ")
+ a = li2.newTextChild(None, 'a' , title or 'untitled')
+ a.setProp('href',entry)
+
+ count = count + 1
+ if count >= 10: break
+
+log.info("Writing " + MEMES_ATOM)
+output=open(MEMES_ATOM,'w')
+output.write(feed_doc.serialize('utf-8'))
+output.close()
+
+sys.stdout.write(doc.serialize('utf-8'))
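
The scoring in this plugin rewards breadth over volume: each linking entry contributes a quadratically decaying weight, but only the strongest vote per feed counts. A standalone sketch of the same arithmetic, with hypothetical feeds:

    import time

    now = time.time()
    week = 7 * 86400

    def weight(mtime):
        # quadratic decay: a vote cast now is worth ~1.0,
        # one cast a week ago is worth ~0.0
        return 1.0 - (now - mtime) ** 2 / week ** 2

    # one (maximum) vote per feed, so ten posts from one blog count once
    votes = {'feed-a': weight(now - 3600),         # an hour old: ~1.0
             'feed-b': weight(now - 3 * 86400)}    # three days old: ~0.82
    score = sum(votes.values())                    # ~1.82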
9 planet/config.py
@@ -352,14 +352,15 @@ def filters(section=None):
filters = []
if parser.has_option('Planet', 'filters'):
filters += parser.get('Planet', 'filters').split()
- if section and parser.has_option(section, 'filters'):
- filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(exclude(section)))
+ for section in section and [section] or template_files():
+ if parser.has_option(section, 'filters'):
+ filters += parser.get(section, 'filters').split()
return filters
def planet_options():
@@ -382,6 +383,10 @@ def template_options(section):
""" dictionary of template specific options"""
return feed_options(section)
+def filter_options(section):
+ """ dictionary of filter specific options"""
+ return feed_options(section)
+
def write(file=sys.stdout):
""" write out an updated template """
print parser.write(file)
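
The filters() change reads densely: `section and [section] or template_files()` means "the named section if one was given, otherwise every template file", so per-template filters such as mememe.plugin are now picked up even when no section is passed. A sketch of the effect, assuming the example config above:

    from planet import config

    config.load('examples/opml-top100.ini')
    print config.filters('index.html.tmpl')  # now includes 'mememe.plugin'
    print config.filters()                   # template filters folded in too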
172 planet/html5lib/html5parser.py
@@ -71,35 +71,40 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
"trailingEnd": TrailingEndPhase(self, self.tree)
}
- def parse(self, stream, encoding=None, innerHTML=False):
- """Parse a HTML document into a well-formed tree
-
- stream - a filelike object or string containing the HTML to be parsed
-
- innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
- is not yet supported)
-
- The optional encoding parameter must be a string that indicates
- the encoding. If specified, that encoding will be used,
- regardless of any BOM or later declaration (such as in a meta
- element)
- """
-
+ def _parse(self, stream, innerHTML=False, container="div",
+ encoding=None):
+
self.tree.reset()
self.firstStartTag = False
self.errors = []
- self.phase = self.phases["initial"]
+ self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
+ parseMeta=innerHTML)
+
+ if innerHTML:
+ self.innerHTML = container.lower()
+
+ if self.innerHTML in ('title', 'textarea'):
+ self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+ elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
+ self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+ elif self.innerHTML == 'plaintext':
+ self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+ else:
+ # contentModelFlag already is PCDATA
+ #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
+ pass
+ self.phase = self.phases["rootElement"]
+ self.phase.insertHtmlElement()
+ self.resetInsertionMode()
+ else:
+ self.innerHTML = False
+ self.phase = self.phases["initial"]
+
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
self.lastPhase = None
- # We don't actually support innerHTML yet but this should allow
- # assertations
- self.innerHTML = innerHTML
-
- self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
-
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
for token in self.tokenizer:
@@ -118,7 +123,34 @@ def parse(self, stream, encoding=None, innerHTML=False):
# When the loop finishes it's EOF
self.phase.processEOF()
+ def parse(self, stream, encoding=None):
+ """Parse a HTML document into a well-formed tree
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+ """
+ self._parse(stream, innerHTML=False, encoding=encoding)
return self.tree.getDocument()
+
+ def parseFragment(self, stream, container="div", encoding=None):
+ """Parse a HTML fragment into a well-formed tree fragment
+
+ container - name of the element we're setting the innerHTML property
+ if set to None, default to 'div'
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+ """
+ self._parse(stream, True, container=container, encoding=encoding)
+ return self.tree.getFragment()
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
@@ -187,28 +219,29 @@ def resetInsertionMode(self):
"frameset":"inFrameset"
}
for node in self.tree.openElements[::-1]:
+ nodeName = node.name
if node == self.tree.openElements[0]:
last = True
- if node.name not in ['td', 'th']:
+ if nodeName not in ['td', 'th']:
# XXX
assert self.innerHTML
- raise NotImplementedError
+ nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
- if node.name in ("select", "colgroup", "head", "frameset"):
+ if nodeName in ("select", "colgroup", "head", "frameset"):
# XXX
assert self.innerHTML
- if node.name in newModes:
- self.phase = self.phases[newModes[node.name]]
+ if nodeName in newModes:
+ self.phase = self.phases[newModes[nodeName]]
break
- elif node.name == "html":
+ elif nodeName == "html":
if self.tree.headPointer is None:
self.phase = self.phases["beforeHead"]
else:
self.phase = self.phases["afterHead"]
break
elif last:
- self.phase = self.phases["body"]
+ self.phase = self.phases["inBody"]
break
class Phase(object):
@@ -434,9 +467,7 @@ def processCharacters(self, data):
self.parser.phase.processCharacters(data)
def startTagHead(self, name, attributes):
- self.tree.insertElement(name, attributes)
- self.tree.headPointer = self.tree.openElements[-1]
- self.parser.phase = self.parser.phases["inHead"]
+ self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))
def startTagTitle(self, name, attributes):
element = self.tree.createElement(name, attributes)
@@ -455,10 +486,11 @@ def startTagStyle(self, name, attributes):
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagScript(self, name, attributes):
+ #XXX Inner HTML case may be wrong
element = self.tree.createElement(name, attributes)
element._flags.append("parser-inserted")
- if self.tree.headPointer is not None and\
- self.parser.phase == self.parser.phases["inHead"]:
+ if (self.tree.headPointer is not None and
+ self.parser.phase == self.parser.phases["inHead"]):
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
@@ -653,8 +685,8 @@ def startTagFromHead(self, name, attributes):
def startTagBody(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag (body)."))
- if len(self.tree.openElements) == 1 \
- or self.tree.openElements[1].name != "body":
+ if (len(self.tree.openElements) == 1
+ or self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
for attr, value in attributes.iteritems():
@@ -1179,6 +1211,7 @@ def endTagTable(self, name):
self.parser.resetInsertionMode()
else:
# innerHTML case
+ assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, name):
@@ -1215,23 +1248,25 @@ def __init__(self, parser, tree):
])
self.endTagHandler.default = self.endTagOther
+ def ignoreEndTagCaption(self):
+ return not self.tree.elementInScope("caption", True)
+
def processCharacters(self, data):
self.parser.phases["inBody"].processCharacters(data)
def startTagTableElement(self, name, attributes):
self.parser.parseError()
+ #XXX Have to duplicate logic here to find out if the tag is ignored
+ ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
- # XXX how do we know the tag is _always_ ignored in the innerHTML
- # case and therefore shouldn't be processed again? I'm not sure this
- # strategy makes sense...
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inBody"].processStartTag(name, attributes)
def endTagCaption(self, name):
- if self.tree.elementInScope(name, True):
+ if not self.ignoreEndTagCaption():
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "caption":
@@ -1244,14 +1279,15 @@ def endTagCaption(self, name):
self.parser.phase = self.parser.phases["inTable"]
else:
# innerHTML case
+ assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
self.parser.parseError()
+ ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
- # XXX ...
- if not self.parser.innerHTML:
- self.parser.phase.processStartTag(name, attributes)
+ if not ignoreEndTag:
+ self.parser.phase.processEndTag(name)
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@@ -1279,10 +1315,13 @@ def __init__(self, parser, tree):
])
self.endTagHandler.default = self.endTagOther
+ def ignoreEndTagColgroup(self):
+ return self.tree.openElements[-1].name == "html"
+
def processCharacters(self, data):
+ ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
- # XXX
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processCharacters(data)
def startTagCol(self, name ,attributes):
@@ -1290,14 +1329,15 @@ def startTagCol(self, name ,attributes):
self.tree.openElements.pop()
def startTagOther(self, name, attributes):
+ ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
- # XXX how can be sure it's always ignored?
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def endTagColgroup(self, name):
- if self.tree.openElements[-1].name == "html":
+ if self.ignoreEndTagColgroup():
# innerHTML case
+ assert self.parser.innerHTML
self.parser.parseError()
else:
self.tree.openElements.pop()
@@ -1308,9 +1348,9 @@ def endTagCol(self, name):
u"col has no end tag."))
def endTagOther(self, name):
+ ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
- # XXX how can be sure it's always ignored?
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processEndTag(name)
@@ -1359,9 +1399,9 @@ def startTagTableCell(self, name, attributes):
def startTagTableOther(self, name, attributes):
# XXX AT Any ideas on how to share this with endTagTable?
- if self.tree.elementInScope("tbody", True) or \
- self.tree.elementInScope("thead", True) or \
- self.tree.elementInScope("tfoot", True):
+ if (self.tree.elementInScope("tbody", True) or
+ self.tree.elementInScope("thead", True) or
+ self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processStartTag(name, attributes)
@@ -1382,9 +1422,9 @@ def endTagTableRowGroup(self, name):
") in the table body phase. Ignored."))
def endTagTable(self, name):
- if self.tree.elementInScope("tbody", True) or \
- self.tree.elementInScope("thead", True) or \
- self.tree.elementInScope("tfoot", True):
+ if (self.tree.elementInScope("tbody", True) or
+ self.tree.elementInScope("thead", True) or
+ self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processEndTag(name)
@@ -1428,6 +1468,9 @@ def clearStackToTableRowContext(self):
self.tree.openElements[-1].name + u") in the row phase."))
self.tree.openElements.pop()
+ def ignoreEndTagTr(self):
+ return not self.tree.elementInScope("tr", tableVariant=True)
+
# the rest
def processCharacters(self, data):
self.parser.phases["inTable"].processCharacters(data)
@@ -1439,28 +1482,31 @@ def startTagTableCell(self, name, attributes):
self.tree.activeFormattingElements.append(Marker)
def startTagTableOther(self, name, attributes):
+ ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# XXX how are we sure it's always ignored in the innerHTML case?
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inTable"].processStartTag(name, attributes)
def endTagTr(self, name):
- if self.tree.elementInScope("tr", True):
+ if not self.ignoreEndTagTr():
self.clearStackToTableRowContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTableBody"]
else:
# innerHTML case
+ assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
+ ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
- if not self.parser.innerHTML:
+ if not ignoreEndTag:
self.parser.phase.processEndTag(name)
def endTagTableRowGroup(self, name):
@@ -1628,7 +1674,7 @@ def endTagOptgroup(self, name):
u"select phase. Ignored."))
def endTagSelect(self, name):
- if self.tree.elementInScope(name, True):
+ if self.tree.elementInScope("select", True):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
@@ -1641,7 +1687,7 @@ def endTagTableElements(self, name):
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
") in the select phase."))
if self.tree.elementInScope(name, True):
- self.endTagSelect()
+ self.endTagSelect("select")
self.parser.phase.processEndTag(name)
def endTagOther(self, name):
@@ -1736,8 +1782,8 @@ def endTagFrameset(self, name):
u"in the frameset phase (innerHTML)."))
else:
self.tree.openElements.pop()
- if not self.parser.innerHTML and\
- self.tree.openElements[-1].name != "frameset":
+ if (not self.parser.innerHTML and
+ self.tree.openElements[-1].name != "frameset"):
# If we're not in innerHTML mode and the the current node is not a
# "frameset" element (anymore) then switch.
self.parser.phase = self.parser.phases["afterFrameset"]
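
The new parseFragment entry point is what makes innerHTML-style parsing usable from outside the parser. A usage sketch, assuming the bundled html5lib and its default simpletree builder:

    from planet import html5lib

    parser = html5lib.html5parser.HTMLParser()
    fragment = parser.parseFragment("<td>cell", container="tr")
    print fragment.printTree()    # tree rooted at #document-fragment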
15 planet/html5lib/inputstream.py
@@ -14,7 +14,7 @@ class HTMLInputStream(object):
"""
- def __init__(self, source, encoding=None, chardet=True):
+ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -26,6 +26,8 @@ def __init__(self, source, encoding=None, chardet=True):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
+
+ parseMeta - Look for a <meta> element containing encoding information
"""
# List of where new lines occur
@@ -41,12 +43,9 @@ def __init__(self, source, encoding=None, chardet=True):
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
- #Autodetect encoding if no other information can be found?
- self.chardet = chardet
-
#Detect encoding iff no explicit "transport level" encoding is supplied
if encoding is None or not isValidEncoding(encoding):
- encoding = self.detectEncoding()
+ encoding = self.detectEncoding(parseMeta, chardet)
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
@@ -79,17 +78,17 @@ def openStream(self, source):
stream = cStringIO.StringIO(str(source))
return stream
- def detectEncoding(self):
+ def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
#If there is no BOM need to look for meta elements with encoding
#information
- if encoding is None:
+ if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
#Guess with chardet, if avaliable
- if encoding is None and self.chardet:
+ if encoding is None and chardet:
try:
import chardet
buffer = self.rawStream.read()
46 planet/html5lib/tokenizer.py
@@ -32,8 +32,8 @@ class HTMLTokenizer(object):
# XXX need to fix documentation
- def __init__(self, stream, encoding=None):
- self.stream = HTMLInputStream(stream, encoding)
+ def __init__(self, stream, encoding=None, parseMeta=True):
+ self.stream = HTMLInputStream(stream, encoding, parseMeta)
self.states = {
"data":self.dataState,
@@ -338,31 +338,33 @@ def tagOpenState(self):
self.state = self.states["closeTagOpen"]
else:
self.tokenQueue.append({"type": "Characters", "data": u"<"})
- self.stream.queue.append(data)
+ self.stream.queue.insert(0, data)
self.state = self.states["data"]
return True
def closeTagOpenState(self):
- if self.contentModelFlag in (contentModelFlags["RCDATA"],\
- contentModelFlags["CDATA"]):
- charStack = []
-
- # So far we know that "</" has been consumed. We now need to know
- # whether the next few characters match the name of last emitted
- # start tag which also happens to be the currentToken. We also need
- # to have the character directly after the characters that could
- # match the start tag name.
- for x in xrange(len(self.currentToken["name"]) + 1):
- charStack.append(self.stream.char())
- # Make sure we don't get hit by EOF
- if charStack[-1] == EOF:
- break
-
- # Since this is just for checking. We put the characters back on
- # the stack.
- self.stream.queue.extend(charStack)
+ if (self.contentModelFlag in (contentModelFlags["RCDATA"],
+ contentModelFlags["CDATA"])):
+ if self.currentToken:
+ charStack = []
+
+ # So far we know that "</" has been consumed. We now need to know
+ # whether the next few characters match the name of last emitted
+ # start tag which also happens to be the currentToken. We also need
+ # to have the character directly after the characters that could
+ # match the start tag name.
+ for x in xrange(len(self.currentToken["name"]) + 1):
+ charStack.append(self.stream.char())
+ # Make sure we don't get hit by EOF
+ if charStack[-1] == EOF:
+ break
+
+ # Since this is just for checking. We put the characters back on
+ # the stack.
+ self.stream.queue.extend(charStack)
- if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
+ if self.currentToken \
+ and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
and charStack[-1] in (spaceCharacters |
frozenset((u">", u"/", u"<", EOF))):
# Because the characters are correct we can safely switch to
11 planet/html5lib/treebuilders/_base.py
@@ -108,6 +108,9 @@ class TreeBuilder(object):
#The class to use for creating doctypes
doctypeClass = None
+
+ #Fragment class
+ fragmentClass = None
def __init__(self):
self.reset()
@@ -294,7 +297,6 @@ def getTableMisnestedNodePosition(self):
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
- assert self.innerHTML
fosterParent = self.openElements[0]
return fosterParent, insertBefore
@@ -310,6 +312,13 @@ def generateImpliedEndTags(self, exclude=None):
def getDocument(self):
"Return the final tree"
return self.document
+
+ def getFragment(self):
+ "Return the final fragment"
+ #assert self.innerHTML
+ fragment = self.fragmentClass()
+ self.openElements[0].reparentChildren(fragment)
+ return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
43 planet/html5lib/treebuilders/dom.py
@@ -1,6 +1,8 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
+from xml.sax.saxutils import escape
+from constants import voidElements
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -87,6 +89,9 @@ def elementClass(self, name):
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
+
+ def fragmentClass(self):
+ return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
@@ -96,6 +101,9 @@ def testSerializer(self, element):
def getDocument(self):
return self.dom
+
+ def getFragment(self):
+ return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
@@ -118,7 +126,9 @@ def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
- rv.append("#document")
+ rv.append("#document")
+ elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+ rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
@@ -135,6 +145,32 @@ def serializeElement(element, indent=0):
return "\n".join(rv)
+class HTMLSerializer(object):
+ def serialize(self, node):
+ rv = self.serializeNode(node)
+ for child in node.childNodes:
+ rv += self.serialize(child)
+ if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
+ rv += "</%s>\n"%node.nodeName
+ return rv
+
+ def serializeNode(self, node):
+ if node.nodeType == Node.TEXT_NODE:
+ rv = node.nodeValue
+ elif node.nodeType == Node.ELEMENT_NODE:
+ rv = "<%s"%node.nodeName
+ if node.hasAttributes():
+ rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
+ node.attributes.items()])
+ rv += ">"
+ elif node.nodeType == Node.COMMENT_NODE:
+ rv = "<!-- %s -->" % escape(node.nodeValue)
+ elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
+ rv = "<!DOCTYPE %s>" % node.name
+ else:
+ rv = ""
+ return rv
+
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
@@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
- handler.endDocument()
+ handler.endDocument()
+
+ elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+ for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE
8 planet/html5lib/treebuilders/etreefull.py
@@ -129,6 +129,10 @@ class Document(Element):
def __init__(self):
Element.__init__(self, Document)
+class DocumentFragment(Element):
+ def __init__(self):
+ Element.__init__(self, DocumentFragment)
+
def testSerializer(element):
rv = []
finalText = None
@@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder):
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
+ fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element
+
+ def getFragment(self):
+ return _base.TreeBuilder.getFragment(self)._element
47 planet/html5lib/treebuilders/simpletree.py
@@ -4,6 +4,7 @@
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
+ type = -1
def __init__(self, name):
self.name = name
self.parent = None
@@ -11,15 +12,18 @@ def __init__(self, name):
self.childNodes = []
self._flags = []
+ def __iter__(self):
+ for node in self.childNodes:
+ yield node
+ for item in node:
+ yield item
+
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
- def __repr__(self):
- return "<%s %s>" % (self.__class__, self.name)
-
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
@@ -69,6 +73,7 @@ def hasContent(self):
return bool(self.childNodes)
class Document(Node):
+ type = 1
def __init__(self):
Node.__init__(self, None)
@@ -93,7 +98,13 @@ def printTree(self):
tree += child.printTree(2)
return tree
+class DocumentFragment(Document):
+ type = 2
+ def __unicode__(self):
+ return "#document-fragment"
+
class DocumentType(Node):
+ type = 3
def __init__(self, name):
Node.__init__(self, name)
@@ -106,6 +117,7 @@ def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
class TextNode(Node):
+ type = 4
def __init__(self, value):
Node.__init__(self, None)
self.value = value
@@ -119,6 +131,7 @@ def toxml(self):
hilite = toxml
class Element(Node):
+ type = 5
def __init__(self, name):
Node.__init__(self, name)
self.attributes = {}
@@ -164,6 +177,7 @@ def printTree(self, indent):
return tree
class CommentNode(Node):
+ type = 6
def __init__(self, data):
Node.__init__(self, None)
self.data = data
@@ -177,11 +191,38 @@ def toxml(self):
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
+class HTMLSerializer(object):
+ def serialize(self, node):
+ rv = self.serializeNode(node)
+ for child in node.childNodes:
+ rv += self.serialize(child)
+ if node.type == Element.type and node.name not in voidElements:
+ rv += "</%s>\n"%node.name
+ return rv
+
+ def serializeNode(self, node):
+ if node.type == TextNode.type:
+ rv = node.value
+ elif node.type == Element.type:
+ rv = "<%s"%node.name
+ if node.attributes:
+ rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
+ node.attributes.iteritems()])
+ rv += ">"
+ elif node.type == CommentNode.type:
+ rv = "<!-- %s -->" % escape(node.data)
+ elif node.type == DocumentType.type:
+ rv = "<!DOCTYPE %s>" % node.name
+ else:
+ rv = ""
+ return rv
+
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = CommentNode
+ fragmentClass = DocumentFragment
def testSerializer(self, node):
return node.printTree()
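
For reference, the new simpletree HTMLSerializer can be driven directly; a sketch with a throwaway document:

    from planet.html5lib import html5parser
    from planet.html5lib.treebuilders import simpletree

    doc = html5parser.HTMLParser(tree=simpletree.TreeBuilder).parse("<p>hi")
    print simpletree.HTMLSerializer().serialize(doc)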
1  planet/shell/__init__.py
@@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'):
# Execute the shell module
options = planet.config.template_options(template_file)
+ if module_name == 'plugin': options['__file__'] = template_file
options.update(extra_options)
log.debug("Processing %s %s using %s", mode,
os.path.realpath(template_resolved), module_name)
64 planet/shell/plugin.py
@@ -0,0 +1,64 @@
+import os, sys, imp
+from StringIO import StringIO
+
+def run(script, doc, output_file=None, options={}):
+ """ process an Python script using imp """
+ save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv)
+ plugin_stdout = StringIO()
+ plugin_stderr = StringIO()
+
+ try:
+ # redirect stdin
+ sys.stdin = StringIO(doc)
+
+ # redirect stdout
+ if output_file:
+ sys.stdout = open(output_file, 'w')
+ else:
+ sys.stdout = plugin_stdout
+
+ # redirect stderr
+ sys.stderr = plugin_stderr
+
+ # determine __file__ value
+ if options.has_key("__file__"):
+ plugin_file = options["__file__"]
+ del options["__file__"]
+ else:
+ plugin_file = script
+
+ # set sys.argv
+ options = sum([['--'+key, value] for key,value in options.items()], [])
+ sys.argv = [plugin_file] + options
+
+ # import script
+ handle = open(script, 'r')
+ cwd = os.getcwd()
+ try:
+ try:
+ try:
+ description=('.plugin', 'rb', imp.PY_SOURCE)
+ imp.load_module('__main__',handle,plugin_file,description)
+ except SystemExit,e:
+          if e.code:
+            import planet
+            planet.logger.error('%s exit rc=%d', plugin_file, e.code)
+ except Exception, e:
+ import traceback
+ type, value, tb = sys.exc_info()
+ plugin_stderr.write(''.join(
+ traceback.format_exception_only(type,value) +
+ traceback.format_tb(tb)))
+ finally:
+ handle.close()
+ if cwd != os.getcwd(): os.chdir(cwd)
+
+ finally:
+ # restore system state
+ sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
+
+ # log anything sent to stderr
+ if plugin_stderr.getvalue():
+ import planet
+ planet.logger.error(plugin_stderr.getvalue())
+
+ # return stdout
+ return plugin_stdout.getvalue()
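
For reference, this is roughly how the shell layer drives a plugin (planet/shell/__init__.py dispatches on the .plugin extension; the call below is a sketch):

    from planet.shell import plugin

    output = plugin.run('filters/html2xhtml.plugin', '<p>tag soup')
    # stdout is captured and returned; anything on stderr is logged as ERROR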
1  planet/spider.py
@@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log):
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
- # log = planet.getLogger(config.log_level(),config.log_format())
log = planet.getLogger(config.log_level(),config.log_format())
global index
23 runtests.py
@@ -18,12 +18,23 @@ def fullmodname(path):
if sys.path[0]: os.chdir(sys.path[0])
sys.path[0] = os.getcwd()
+# determine verbosity
+verbosity = 1
+for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
+ if arg in sys.argv:
+ verbosity = value
+ sys.argv.remove(arg)
+
# find all of the planet test modules
-modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
+modules = []
+for pattern in sys.argv[1:] or ['test_*.py']:
+ modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))
-# enable warnings
+# enable logging
import planet
-planet.getLogger("WARNING",None)
+if verbosity == 0: planet.getLogger("FATAL",None)
+if verbosity == 1: planet.getLogger("WARNING",None)
+if verbosity == 2: planet.getLogger("DEBUG",None)
# load all of the tests into a suite
try:
@@ -33,11 +44,5 @@ def fullmodname(path):
for module in modules: __import__(module)
raise
-verbosity = 1
-if "-q" in sys.argv or '--quiet' in sys.argv:
- verbosity = 0
-if "-v" in sys.argv or '--verbose' in sys.argv:
- verbosity = 2
-
# run test suite
unittest.TextTestRunner(verbosity=verbosity).run(suite)
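
In practice the two runtests.py changes combine nicely: something like `python runtests.py -v 'test_apply*.py'` now runs just the templating tests with debug logging, where previously the whole suite ran against a fixed module list.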
29 tests/data/apply/config-mememe.ini
@@ -0,0 +1,29 @@
+[Planet]
+output_theme = classic_fancy
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+
+bill_of_materials:
+ images/#{face}
+
+[index.html.tmpl]
+filters:
+ html2xhtml.plugin
+ mememe.plugin
+
+[mememe.plugin]
+sidebar = //*[@class='sidebar']
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+face = jdub.png
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
16 tests/test_apply.py
@@ -21,8 +21,7 @@ def setUp(self):
os.makedirs(workdir)
def tearDown(self):
- shutil.rmtree(workdir)
- os.removedirs(os.path.split(workdir)[0])
+ shutil.rmtree(os.path.split(workdir)[0])
def test_apply_asf(self):
config.load(configfile % 'asf')
@@ -65,7 +64,20 @@ def test_apply_filter_html(self):
output = open(os.path.join(workdir, 'index.html4')).read()
self.assertTrue(output.find('/>')<0)
+ def test_apply_filter_mememe(self):
+ config.load(configfile % 'mememe')
+ self.apply_fancy()
+
+ output = open(os.path.join(workdir, 'index.html')).read()
+ self.assertTrue(output.find('<div class="sidebar"><h2>Memes <a href="memes.atom">')>=0)
+
def apply_fancy(self):
+ # drop slow templates unrelated to test at hand
+ templates = config.parser.get('Planet','template_files').split()
+ templates.remove('rss10.xml.tmpl')
+ templates.remove('rss20.xml.tmpl')
+ config.parser.set('Planet','template_files',' '.join(templates))
+
splice.apply(self.feeddata)
# verify that selected files are there