# --- residue from the web page this file was copied from (not code) ---
# Skip to content
#
# HTTPS clone URL
#
# Subversion checkout URL
#
# You can clone with HTTPS or Subversion.
#
# Download ZIP
# branch: master
# Fetching contributors...
#
# Cannot retrieve contributors at this time
#
# executable file 155 lines (133 sloc) 6.095 kb
from xml.sax import ContentHandler, make_parser, SAXParseException
from xml.sax.xmlreader import InputSource
from sgmllib import SGMLParser
from cStringIO import StringIO
from ConfigParser import ConfigParser
from htmlentitydefs import entitydefs
import re
# input = opml, output = ConfigParser
def opml2config(opml, config=None):
    """Convert an OPML subscription list into ConfigParser sections.

    Arguments:
      opml   -- the OPML document, either as a string or as an object
                with a read() method (which is read in full)
      config -- optional ConfigParser to add to; a new one is created
                when this is None

    Returns the ConfigParser that was filled in.

    The document is first parsed as XML with SAX. If that raises
    SAXParseException, the same text is parsed again as SGML, which
    tolerates markup that is not well-formed. Entries collected before
    the XML error are kept.
    """
    from xml.sax import SAXNotRecognizedException, SAXNotSupportedException
    from xml.sax.handler import feature_external_ges

    if hasattr(opml, 'read'):
        opml = opml.read()

    if config is None:
        config = ConfigParser()

    opmlParser = OpmlParser(config)

    try:
        # try SAX
        source = InputSource()
        source.setByteStream(StringIO(opml))
        parser = make_parser()
        try:
            # subscription lists are fetched from other sites: do not let
            # the document make the parser load external entities
            parser.setFeature(feature_external_ges, False)
        except (SAXNotRecognizedException, SAXNotSupportedException):
            # this parser has no such switch; keep its defaults
            pass
        parser.setContentHandler(opmlParser)
        parser.parse(source)
    except SAXParseException:
        # try as SGML
        opmlParser.feed(opml)
        # feed() may hold back trailing input; close() processes it
        opmlParser.close()

    return config
# Parse OPML via either SAX or SGML
class OpmlParser(ContentHandler, SGMLParser):
    """Collect feed subscriptions from OPML 'outline' elements.

    One object serves both parsing strategies: as a SAX content handler
    its startElement() is called directly, and as an SGML parser every
    start tag arrives in unknown_starttag(), which forwards it to
    startElement(). Each accepted outline becomes a section of the
    supplied config, named after the feed URL, with a 'name' option
    holding the feed title.
    """

    # one entity or character reference; the group captures the part
    # between '&' and ';'
    entities = re.compile(r'&(#?\w+);')

    def __init__(self, config):
        """Initialise both base classes and keep the config to fill in."""
        ContentHandler.__init__(self)
        SGMLParser.__init__(self)
        self.config = config

    def startElement(self, name, attrs):
        """Add a config section for an 'outline' element describing a feed.

        name is the element name; attrs is either a SAX attributes object
        or a plain dict (when called from unknown_starttag). Elements
        that are not outlines, have a type other than rss/atom, or lack
        a usable feed URL or title are ignored.
        """
        # we are only looking for data in 'outline' nodes.
        if name != 'outline':
            return

        # work on a private copy so the corrections below never modify
        # the object owned by the parser
        attrs = dict(attrs.items())

        # A type of 'rss' is meant to be used generically to indicate that
        # this is an entry in a subscription list, but some leave this
        # attribute off, and others have placed 'atom' in here
        if 'type' in attrs:
            if attrs['type'] == 'link' and 'url' not in attrs:
                # Auto-correct WordPress link manager OPML files
                attrs['type'] = 'rss'
            if attrs['type'].lower() not in ('rss', 'atom'):
                return

        # The feed itself is supposed to be in an attribute named 'xmlUrl'
        # (note the camel casing), but this has proven to be problematic,
        # with the most common misspelling being in all lower-case
        if 'xmlUrl' not in attrs or not attrs['xmlUrl'].strip():
            for attribute in list(attrs):
                if attribute.lower() == 'xmlurl' and attrs[attribute].strip():
                    attrs['xmlUrl'] = attrs[attribute]
                    break
            else:
                # no spelling of the attribute carries a value
                return

        # the text attribute is nominally required in OPML, but this
        # data is often found in a title attribute instead
        if 'text' not in attrs or not attrs['text'].strip():
            if 'title' not in attrs or not attrs['title'].strip():
                return
            attrs['text'] = attrs['title']

        # if we get this far, we either have a valid subscription list
        # entry, or one with a correctable error. Add it to the
        # configuration, if it is not already there.
        xmlUrl = attrs['xmlUrl']
        if not self.config.has_section(xmlUrl):
            self.config.add_section(xmlUrl)
            self.config.set(xmlUrl, 'name', self.unescape(attrs['text']))

    def unescape(self, text):
        """Expand entity and character references; return UTF-8 bytes.

        Names found in entitydefs and decimal (&#8212;) or hexadecimal
        (&#x2014;) character references are replaced by the character
        they denote. A reference that cannot be resolved -- an unknown
        name, a malformed number, or a code point unichr() rejects -- is
        kept exactly as written, so a single bad reference neither loses
        text nor aborts the parse.
        """
        # splitting on a pattern with one group alternates literal text
        # (even indexes) with the captured reference names (odd indexes)
        parsed = self.entities.split(text)
        for i in range(1, len(parsed), 2):
            parsed[i] = self._expand_reference(parsed[i])
        return u''.join(parsed).encode('utf-8')

    def _expand_reference(self, ref):
        """Return the character for one reference name, or '&ref;' as is."""
        literal = u'&%s;' % ref

        if ref in entitydefs:
            # named entities: the table holds either the character
            # itself or a '&#NNN;' character reference
            replacement = entitydefs[ref]
            match = self.entities.match(replacement)
            if not match:
                return unichr(ord(replacement))
            ref = match.group(1)

        if not ref.startswith('#'):
            # unknown name: leave the text untouched
            return literal

        # numeric entities
        try:
            if ref[1:2] in ('x', 'X'):
                return unichr(int(ref[2:], 16))
            return unichr(int(ref[1:]))
        except (ValueError, OverflowError):
            # not a number, or not a code point unichr() accepts
            return literal

    # SGML => SAX
    def unknown_starttag(self, name, attrs):
        """Forward an SGML start tag to startElement() with decoded values.

        attrs is converted to a dict. Each value is decoded as UTF-8; if
        that fails it is decoded as ISO-8859-1 and any character listed
        in the module-level cp1252 table is replaced by its mapped
        character.
        """
        attrs = dict(attrs)
        for attribute in attrs:
            try:
                attrs[attribute] = attrs[attribute].decode('utf-8')
            except UnicodeError:
                work = attrs[attribute].decode('iso-8859-1')
                attrs[attribute] = u''.join([cp1252.get(c, c) for c in work])
        self.startElement(name, attrs)
# http://www.intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
# Replacement table for text decoded as ISO-8859-1 that was really
# Windows-1252: each pair is (code point the decode produced, code point
# that was intended). Keys and values of the table are single characters.
cp1252 = dict((unichr(decoded), unichr(intended)) for decoded, intended in [
    (128, 8364),  # euro sign
    (130, 8218),  # single low-9 quotation mark
    (131,  402),  # latin small letter f with hook
    (132, 8222),  # double low-9 quotation mark
    (133, 8230),  # horizontal ellipsis
    (134, 8224),  # dagger
    (135, 8225),  # double dagger
    (136,  710),  # modifier letter circumflex accent
    (137, 8240),  # per mille sign
    (138,  352),  # latin capital letter s with caron
    (139, 8249),  # single left-pointing angle quotation mark
    (140,  338),  # latin capital ligature oe
    (142,  381),  # latin capital letter z with caron
    (145, 8216),  # left single quotation mark
    (146, 8217),  # right single quotation mark
    (147, 8220),  # left double quotation mark
    (148, 8221),  # right double quotation mark
    (149, 8226),  # bullet
    (150, 8211),  # en dash
    (151, 8212),  # em dash
    (152,  732),  # small tilde
    (153, 8482),  # trade mark sign
    (154,  353),  # latin small letter s with caron
    (155, 8250),  # single right-pointing angle quotation mark
    (156,  339),  # latin small ligature oe
    (158,  382),  # latin small letter z with caron
    (159,  376),  # latin capital letter y with diaeresis
])
if __name__ == "__main__":
    # small main program which converts OPML into config.ini format:
    # every command line argument is opened with urllib.urlopen(), all
    # subscriptions are merged into one config, and the result is
    # written to standard output
    import sys
    import urllib

    config = ConfigParser()
    for opml in sys.argv[1:]:
        stream = urllib.urlopen(opml)
        try:
            opml2config(stream, config)
        finally:
            # release the connection even when parsing fails
            stream.close()
    config.write(sys.stdout)
# --- residue from the web page this file was copied from (not code) ---
# Jump to Line
# Something went wrong with that request. Please try again.