Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Fetching contributors…
Cannot retrieve contributors at this time
executable file 155 lines (133 sloc) 5.95 KB
from xml.sax import ContentHandler, make_parser, SAXParseException
from xml.sax.xmlreader import InputSource
from sgmllib import SGMLParser
from cStringIO import StringIO
from ConfigParser import ConfigParser
from htmlentitydefs import entitydefs
import re
# input = opml, output = ConfigParser
def opml2config(opml, config=None):
if hasattr(opml, 'read'):
opml =
if not config:
config = ConfigParser()
opmlParser = OpmlParser(config)
# try SAX
source = InputSource()
parser = make_parser()
except SAXParseException:
# try as SGML
return config
# Parse OPML via either SAX or SGML
class OpmlParser(ContentHandler,SGMLParser):
entities = re.compile('&(#?\w+);')
def __init__(self, config):
self.config = config
def startElement(self, name, attrs):
# we are only looking for data in 'outline' nodes.
if name != 'outline': return
# A type of 'rss' is meant to be used generically to indicate that
# this is an entry in a subscription list, but some leave this
# attribute off, and others have placed 'atom' in here
if attrs.has_key('type'):
if attrs['type'] == 'link' and not attrs.has_key('url'):
# Auto-correct WordPress link manager OPML files
attrs = dict(attrs.items())
attrs['type'] = 'rss'
if attrs['type'].lower() not in['rss','atom']: return
# The feed itself is supposed to be in an attribute named 'xmlUrl'
# (note the camel casing), but this has proven to be problematic,
# with the most common misspelling being in all lower-case
if not attrs.has_key('xmlUrl') or not attrs['xmlUrl'].strip():
for attribute in attrs.keys():
if attribute.lower() == 'xmlurl' and attrs[attribute].strip():
attrs = dict(attrs.items())
attrs['xmlUrl'] = attrs[attribute]
# the text attribute is nominally required in OPML, but this
# data is often found in a title attribute instead
if not attrs.has_key('text') or not attrs['text'].strip():
if not attrs.has_key('title') or not attrs['title'].strip(): return
attrs = dict(attrs.items())
attrs['text'] = attrs['title']
# if we get this far, we either have a valid subscription list entry,
# or one with a correctable error. Add it to the configuration, if
# it is not already there.
xmlUrl = attrs['xmlUrl']
if not self.config.has_section(xmlUrl):
self.config.set(xmlUrl, 'name', self.unescape(attrs['text']))
def unescape(self, text):
parsed = self.entities.split(text)
for i in range(1,len(parsed),2):
if parsed[i] in entitydefs.keys():
# named entities
if match:
# numeric entities
if parsed[i].startswith('#'):
if parsed[i].startswith('#x'):
return u''.join(parsed).encode('utf-8')
def unknown_starttag(self, name, attrs):
attrs = dict(attrs)
for attribute in attrs:
attrs[attribute] = attrs[attribute].decode('utf-8')
work = attrs[attribute].decode('iso-8859-1')
work = u''.join([c in cp1252 and cp1252[c] or c for c in work])
attrs[attribute] = work
self.startElement(name, attrs)
cp1252 = {
unichr(128): unichr(8364), # euro sign
unichr(130): unichr(8218), # single low-9 quotation mark
unichr(131): unichr( 402), # latin small letter f with hook
unichr(132): unichr(8222), # double low-9 quotation mark
unichr(133): unichr(8230), # horizontal ellipsis
unichr(134): unichr(8224), # dagger
unichr(135): unichr(8225), # double dagger
unichr(136): unichr( 710), # modifier letter circumflex accent
unichr(137): unichr(8240), # per mille sign
unichr(138): unichr( 352), # latin capital letter s with caron
unichr(139): unichr(8249), # single left-pointing angle quotation mark
unichr(140): unichr( 338), # latin capital ligature oe
unichr(142): unichr( 381), # latin capital letter z with caron
unichr(145): unichr(8216), # left single quotation mark
unichr(146): unichr(8217), # right single quotation mark
unichr(147): unichr(8220), # left double quotation mark
unichr(148): unichr(8221), # right double quotation mark
unichr(149): unichr(8226), # bullet
unichr(150): unichr(8211), # en dash
unichr(151): unichr(8212), # em dash
unichr(152): unichr( 732), # small tilde
unichr(153): unichr(8482), # trade mark sign
unichr(154): unichr( 353), # latin small letter s with caron
unichr(155): unichr(8250), # single right-pointing angle quotation mark
unichr(156): unichr( 339), # latin small ligature oe
unichr(158): unichr( 382), # latin small letter z with caron
unichr(159): unichr( 376)} # latin capital letter y with diaeresis
if __name__ == "__main__":
# small main program which converts OPML into config.ini format
import sys, urllib
config = ConfigParser()
for opml in sys.argv[1:]:
opml2config(urllib.urlopen(opml), config)
Jump to Line
Something went wrong with that request. Please try again.