Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 155 lines (133 sloc) 6.095 kb
87f99e8 Reading lists
Sam Ruby authored
1 from xml.sax import ContentHandler, make_parser, SAXParseException
2 from xml.sax.xmlreader import InputSource
3 from sgmllib import SGMLParser
4 from cStringIO import StringIO
5 from ConfigParser import ConfigParser
6 from htmlentitydefs import entitydefs
7 import re
8
# input = opml, output = ConfigParser
def opml2config(opml, config=None):
    """Copy the feed subscriptions found in an OPML document into a config.

    opml   -- the OPML text, or any object whose read() method returns it
    config -- the ConfigParser to populate; a new one is created when this
              argument is omitted

    The document is parsed as XML first; if it is not well formed it is
    parsed again leniently as SGML.  Returns the populated ConfigParser.
    """
    # local imports: nothing else in the module needs these names
    from xml.sax import SAXNotRecognizedException, SAXNotSupportedException
    from xml.sax.handler import feature_external_ges

    if hasattr(opml, 'read'):
        opml = opml.read()

    # compare against None so that a config object supplied by the caller is
    # never swapped for a fresh one merely because it evaluates as false
    if config is None:
        config = ConfigParser()

    opmlParser = OpmlParser(config)

    try:
        # try SAX
        source = InputSource()
        source.setByteStream(StringIO(opml))
        parser = make_parser()
        try:
            # subscription lists come from arbitrary sites: do not let the
            # document pull in external entities (local files, other URLs)
            parser.setFeature(feature_external_ges, False)
        except (SAXNotRecognizedException, SAXNotSupportedException):
            # this parser driver has no such switch; keep its defaults
            pass
        parser.setContentHandler(opmlParser)
        parser.parse(source)
    except SAXParseException:
        # try as SGML
        opmlParser.feed(opml)

    return config
32
# Parse OPML via either SAX or SGML
class OpmlParser(ContentHandler,SGMLParser):
    """Collect feed subscriptions from OPML 'outline' elements.

    The same instance acts as a SAX content handler and as an SGML parser.
    Both routes end in startElement(), which records each subscription as
    a section of the config object handed to the constructor.
    """

    # a named (&eacute;) or numeric (&#233; / &#xE9;) reference; the group
    # captures the text between '&' and ';'
    entities = re.compile(r'&(#?\w+);')

    def __init__(self, config):
        """Remember config, the object that receives the subscriptions."""
        ContentHandler.__init__(self)
        SGMLParser.__init__(self)
        self.config = config

    def startElement(self, name, attrs):
        """Record one 'outline' element as a subscription, if it is one.

        name  -- the element name; anything but 'outline' is ignored
        attrs -- the element's attributes, as a mapping of name to value

        A section named after the feed URL is added to self.config, with a
        'name' option holding the unescaped title.  Elements that lack a
        usable feed URL or title, or whose type is neither rss nor atom,
        are skipped.  Sections that already exist are left untouched.
        """
        # we are only looking for data in 'outline' nodes.
        if name != 'outline':
            return

        # work on a private dict so that the corrections below never touch
        # the parser's own attribute object
        attrs = dict(attrs.items())

        # A type of 'rss' is meant to be used generically to indicate that
        # this is an entry in a subscription list, but some leave this
        # attribute off, and others have placed 'atom' in here
        if 'type' in attrs:
            if attrs['type'] == 'link' and 'url' not in attrs:
                # Auto-correct WordPress link manager OPML files
                attrs['type'] = 'rss'
            if attrs['type'].lower() not in ('rss', 'atom'):
                return

        # The feed itself is supposed to be in an attribute named 'xmlUrl'
        # (note the camel casing), but this has proven to be problematic,
        # with the most common misspelling being in all lower-case
        if not attrs.get('xmlUrl', '').strip():
            for attribute in list(attrs):
                if attribute.lower() == 'xmlurl' and attrs[attribute].strip():
                    attrs['xmlUrl'] = attrs[attribute]
                    break
            else:
                # no spelling of the attribute carries a value
                return

        # the text attribute is nominally required in OPML, but this
        # data is often found in a title attribute instead
        if not attrs.get('text', '').strip():
            if not attrs.get('title', '').strip():
                return
            attrs['text'] = attrs['title']

        # if we get this far, we either have a valid subscription list entry,
        # or one with a correctable error.  Add it to the configuration, if
        # it is not already there.
        xmlUrl = attrs['xmlUrl']
        if not self.config.has_section(xmlUrl):
            self.config.add_section(xmlUrl)
            self.config.set(xmlUrl, 'name', self.unescape(attrs['text']))

    def unescape(self, text):
        """Return text as UTF-8 bytes with entity references expanded.

        Named HTML entities and decimal or hexadecimal character references
        are replaced by the characters they stand for.  A reference that
        cannot be resolved (unknown name, malformed or out-of-range number)
        is kept exactly as written instead of being mangled or raising.
        """
        # split() with a capturing group: the odd positions hold the text
        # between '&' and ';', the even positions hold everything else
        parsed = self.entities.split(text)
        for i in range(1, len(parsed), 2):
            parsed[i] = self._resolve_entity(parsed[i])
        return u''.join(parsed).encode('utf-8')

    def _resolve_entity(self, reference):
        """Return the character for one entity name or '#...' reference."""
        digits = reference
        if reference in entitydefs:
            # named entities
            definition = entitydefs[reference]
            match = self.entities.match(definition)
            if not match:
                # defined directly as a single character
                return unichr(ord(definition))
            # defined as a numeric reference; decode it below
            digits = match.group(1)

        # numeric entities
        if digits.startswith('#'):
            try:
                if digits[1:2] in ('x', 'X'):
                    return unichr(int(digits[2:], 16))
                return unichr(int(digits[1:]))
            except (ValueError, OverflowError):
                # not a number, or outside the range unichr() accepts
                pass

        # unresolvable: put back what the author wrote
        return u'&' + reference + u';'

    # SGML => SAX
    def unknown_starttag(self, name, attrs):
        """Decode the SGML parser's attributes and pass them to startElement().

        Each value is decoded as UTF-8 when that works.  Otherwise it is
        decoded as iso-8859-1 and the characters listed in the module's
        cp1252 table are replaced by their Windows-1252 meaning.
        """
        attrs = dict(attrs)
        for attribute in attrs:
            try:
                attrs[attribute] = attrs[attribute].decode('utf-8')
            except UnicodeError:
                work = attrs[attribute].decode('iso-8859-1')
                attrs[attribute] = u''.join([cp1252.get(c, c) for c in work])
        self.startElement(name, attrs)
117
# http://www.intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
#
# Lookup table applied to text that was decoded as iso-8859-1: each pair
# below is (code point found in the text, code point it is replaced with).
cp1252 = dict(map(
    lambda pair: (unichr(pair[0]), unichr(pair[1])),
    [
        (128, 8364),  # euro sign
        (130, 8218),  # single low-9 quotation mark
        (131,  402),  # latin small letter f with hook
        (132, 8222),  # double low-9 quotation mark
        (133, 8230),  # horizontal ellipsis
        (134, 8224),  # dagger
        (135, 8225),  # double dagger
        (136,  710),  # modifier letter circumflex accent
        (137, 8240),  # per mille sign
        (138,  352),  # latin capital letter s with caron
        (139, 8249),  # single left-pointing angle quotation mark
        (140,  338),  # latin capital ligature oe
        (142,  381),  # latin capital letter z with caron
        (145, 8216),  # left single quotation mark
        (146, 8217),  # right single quotation mark
        (147, 8220),  # left double quotation mark
        (148, 8221),  # right double quotation mark
        (149, 8226),  # bullet
        (150, 8211),  # en dash
        (151, 8212),  # em dash
        (152,  732),  # small tilde
        (153, 8482),  # trade mark sign
        (154,  353),  # latin small letter s with caron
        (155, 8250),  # single right-pointing angle quotation mark
        (156,  339),  # latin small ligature oe
        (158,  382),  # latin small letter z with caron
        (159,  376),  # latin capital letter y with diaeresis
    ]))
aaab9e1 FOAF reading lists
Sam Ruby authored
147
if __name__ == "__main__":
    # small main program which converts OPML into config.ini format:
    # every command line argument is a URL (or path) of an OPML document,
    # and the merged result is written to standard output
    import sys
    import urllib

    config = ConfigParser()
    for opml in sys.argv[1:]:
        stream = urllib.urlopen(opml)
        try:
            opml2config(stream, config)
        finally:
            # release the connection even when the document cannot be parsed
            stream.close()
    config.write(sys.stdout)
Something went wrong with that request. Please try again.