from xml.sax import ContentHandler, make_parser, SAXParseException
from xml.sax.xmlreader import InputSource
from sgmllib import SGMLParser
from cStringIO import StringIO
from ConfigParser import ConfigParser
from htmlentitydefs import entitydefs
import re
# input = opml, output = ConfigParser
def opml2config(opml, config=None):
    """Add the feeds listed in an OPML document to a ConfigParser.

    opml   -- the OPML document, either as a string or as an object with
              a read() method (an open file, a urllib response, ...).
    config -- optional ConfigParser to add to; a fresh one is created
              when omitted.

    Returns the ConfigParser, with one section per feed URL found and a
    'name' option holding the feed's title.  Sections that already exist
    are left untouched.
    """
    if hasattr(opml, 'read'):
        opml = opml.read()

    if config is None:
        config = ConfigParser()

    handler = OpmlParser(config)

    try:
        # try SAX
        from xml.sax.handler import feature_external_ges

        source = InputSource()
        source.setByteStream(StringIO(opml))
        parser = make_parser()
        # OPML usually comes from somebody else's server: never let the
        # XML parser fetch external entities on its behalf.
        parser.setFeature(feature_external_ges, False)
        parser.setContentHandler(handler)
        parser.parse(source)
    except SAXParseException:
        # try as SGML: the document is not well-formed XML, so re-read it
        # with the forgiving parser.  Entries already collected by the SAX
        # pass are skipped because their sections exist.
        handler.feed(opml)
        handler.close()

    return config
# Parse OPML via either SAX or SGML
class OpmlParser(ContentHandler, SGMLParser):
    """Collect feed subscriptions from OPML 'outline' elements.

    The class is both a SAX content handler and an SGML parser, so the
    same instance can be driven by a strict XML parse or, for documents
    that are not well-formed, fed directly as SGML.  Every subscription
    found becomes a section of the supplied config, named after the feed
    URL, with a 'name' option holding the feed title.
    """

    # matches one entity or character reference; the group is the bare
    # reference ('amp', '#233', '#xE9')
    entities = re.compile(r'&(#?\w+);')

    def __init__(self, config):
        """Remember the ConfigParser-like object that receives the feeds."""
        ContentHandler.__init__(self)
        SGMLParser.__init__(self)
        self.config = config

    def startElement(self, name, attrs):
        """Record the subscription described by one 'outline' element.

        name  -- element name
        attrs -- mapping of attribute names to values (anything offering
                 items(), such as a SAX attributes object or a dict)

        Elements that are not subscriptions, or that lack a feed URL or
        a title, are ignored.
        """
        # we are only looking for data in 'outline' nodes.
        if name != 'outline':
            return

        # take a private, plain-dict copy: it offers the same lookups for
        # every kind of attrs object and never touches the caller's data
        attrs = dict(attrs.items())

        # A type of 'rss' is meant to be used generically to indicate that
        # this is an entry in a subscription list, but some leave this
        # attribute off, and others have placed 'atom' in here
        if 'type' in attrs:
            entry_type = attrs['type']
            if entry_type == 'link' and 'url' not in attrs:
                # Auto-correct WordPress link manager OPML files
                entry_type = 'rss'
            if entry_type.lower() not in ('rss', 'atom'):
                return

        # The feed itself is supposed to be in an attribute named 'xmlUrl'
        # (note the camel casing), but this has proven to be problematic,
        # with the most common misspelling being in all lower-case
        feed_url = attrs.get('xmlUrl')
        if not feed_url or not feed_url.strip():
            for key, value in attrs.items():
                if key.lower() == 'xmlurl' and value.strip():
                    feed_url = value
                    break
            else:
                # no usable feed address under any spelling
                return

        # the text attribute is nominally required in OPML, but this
        # data is often found in a title attribute instead
        title = attrs.get('text')
        if not title or not title.strip():
            title = attrs.get('title')
            if not title or not title.strip():
                return

        # if we get this far, we either have a valid subscription list
        # entry, or one with a correctable error.  Add it to the
        # configuration, if it is not already there.
        if not self.config.has_section(feed_url):
            self.config.add_section(feed_url)
            self.config.set(feed_url, 'name', self.unescape(title))

    def unescape(self, text):
        """Return text as UTF-8 with entity and character references expanded.

        Named entities known to entitydefs and decimal or hexadecimal
        character references are replaced by the character they denote.
        Anything else that merely looks like a reference is kept as
        written.
        """
        # splitting on a pattern with one group puts the references at the
        # odd indexes and the literal text at the even ones
        parsed = self.entities.split(text)

        for i in range(1, len(parsed), 2):
            reference = parsed[i]
            restored = u'&%s;' % reference

            if reference in entitydefs:
                # named entities: the table holds either the character
                # itself or a numeric reference to it
                replacement = entitydefs[reference]
                match = self.entities.match(replacement)
                if not match:
                    parsed[i] = unichr(ord(replacement))
                    continue
                reference = match.group(1)

            if reference.startswith('#'):
                # numeric entities
                try:
                    if reference[1:2] in ('x', 'X'):
                        parsed[i] = unichr(int(reference[2:], 16))
                    else:
                        parsed[i] = unichr(int(reference[1:]))
                except (ValueError, OverflowError):
                    # not a number, or not a valid code point
                    parsed[i] = restored
            else:
                # unknown name: put the delimiters the split removed back
                parsed[i] = restored

        return u''.join(parsed).encode('utf-8')

    # SGML => SAX
    def unknown_starttag(self, name, attrs):
        """Forward a start tag seen by the SGML parser to startElement.

        attrs is the list of (name, value) pairs handed over by the SGML
        parser.  Byte-string values are decoded as UTF-8; values that are
        not valid UTF-8 are decoded as ISO-8859-1 and then passed through
        the cp1252 table.
        """
        decoded = {}
        for attribute, value in attrs:
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    work = value.decode('iso-8859-1')
                    value = u''.join([cp1252.get(c, c) for c in work])
            decoded[attribute] = value
        self.startElement(name, decoded)
# Replacement table for characters 128-159: each key is the character an
# ISO-8859-1 decode yields for that byte, each value the character named
# in the comment.  OpmlParser.unknown_starttag applies it to attribute
# values that are not valid UTF-8.  Listed as (byte value, code point).
cp1252 = dict(
    (unichr(byte), unichr(codepoint))
    for byte, codepoint in (
        (128, 8364),  # euro sign
        (130, 8218),  # single low-9 quotation mark
        (131, 402),   # latin small letter f with hook
        (132, 8222),  # double low-9 quotation mark
        (133, 8230),  # horizontal ellipsis
        (134, 8224),  # dagger
        (135, 8225),  # double dagger
        (136, 710),   # modifier letter circumflex accent
        (137, 8240),  # per mille sign
        (138, 352),   # latin capital letter s with caron
        (139, 8249),  # single left-pointing angle quotation mark
        (140, 338),   # latin capital ligature oe
        (142, 381),   # latin capital letter z with caron
        (145, 8216),  # left single quotation mark
        (146, 8217),  # right single quotation mark
        (147, 8220),  # left double quotation mark
        (148, 8221),  # right double quotation mark
        (149, 8226),  # bullet
        (150, 8211),  # en dash
        (151, 8212),  # em dash
        (152, 732),   # small tilde
        (153, 8482),  # trade mark sign
        (154, 353),   # latin small letter s with caron
        (155, 8250),  # single right-pointing angle quotation mark
        (156, 339),   # latin small ligature oe
        (158, 382),   # latin small letter z with caron
        (159, 376),   # latin capital letter y with diaeresis
    )
)
if __name__ == "__main__":
    # small main program which converts OPML into config.ini format:
    # every command-line argument is a URL of an OPML document, and the
    # merged result is written to standard output
    import sys
    import urllib

    config = ConfigParser()
    for opml in sys.argv[1:]:
        stream = urllib.urlopen(opml)
        try:
            opml2config(stream, config)
        finally:
            # release the connection even when the document cannot be read
            stream.close()

    # without this the conversion result was built and then thrown away
    config.write(sys.stdout)
