Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Newer
Older
100644 152 lines (133 sloc) 6.398 kB
6cc797c @rubys added a new config option: future_dates
authored
1 """
2 Process a set of configuration-defined sanitizations on a given feed.
3 """
4
5 # Standard library modules
6 import time
7 # Planet modules
8 import planet, config, shell
77d15d2 @rubys xml_base overrides
authored
9 from planet import feedparser
6cc797c @rubys added a new config option: future_dates
authored
10
# Maps the short type names accepted in the configuration to MIME types.
# scrub() falls back to the configured value itself when it is not a key
# here, so a full MIME type may also be configured directly.
type_map = {
    'text': 'text/plain',
    'html': 'text/html',
    'xhtml': 'application/xhtml+xml',
}
13
def _is_future(item, key, now):
    """Return True when item[key] holds a parsed date later than now."""
    return bool(item.has_key(key) and item[key] and item[key] > now)


def _strip_author_html(item, strip):
    """Replace item.author_detail.name with strip(name), if it is present."""
    if item.has_key('author_detail') and item.author_detail.has_key('name'):
        item.author_detail['name'] = str(strip(item.author_detail.name))


def _handle_future_dates(data, mode):
    """Drop dates ('ignore_date') or entries ('ignore_entry') in the future."""
    # Both modes compare against a struct_time.  The original 'ignore_entry'
    # branch used time.time() (a float), which does not compare meaningfully
    # with the struct_time tuples stored under the *_parsed keys.
    now = time.gmtime()

    if _is_future(data.feed, 'updated_parsed', now):
        del data.feed['updated_parsed']

    if mode == 'ignore_date':
        for entry in data.entries:
            for name in ('published', 'updated'):
                if not _is_future(entry, name + '_parsed', now):
                    continue
                del entry[name + '_parsed']
                # The textual date may already be gone; do not fail on it.
                if entry.has_key(name):
                    del entry[name]
    else:
        data.entries = [
            entry for entry in data.entries
            if not _is_future(entry, 'published_parsed', now)
            and not _is_future(entry, 'updated_parsed', now)]


def _rebase(node, entry, feed, scrub_xmlbase):
    """Override node['base'] according to the configured xml_base value."""
    if scrub_xmlbase == 'feed_alternate':
        if entry.has_key('source') and entry.source.has_key('link'):
            node['base'] = entry.source.link
        elif feed.has_key('link'):
            node['base'] = feed.link
    elif scrub_xmlbase == 'entry_alternate':
        if entry.has_key('link'):
            node['base'] = entry.link
    else:
        node['base'] = feedparser._urljoin(node['base'], scrub_xmlbase)


def _sanitize(node):
    """Run node['value'] through the html5lib sanitizer and reserialize it."""
    doc = None
    if 'xhtml' in node['type']:
        try:
            from xml.dom import minidom
            doc = minidom.parseString(node['value'])
        except Exception:
            # Not well-formed XML: demote to HTML and let html5lib parse it.
            node['type'] = 'text/html'

    if doc is None:
        from html5lib import html5parser, treebuilders
        parser = html5parser.HTMLParser(
            tree=treebuilders.getTreeBuilder('dom'))
        doc = parser.parseFragment(node['value'], encoding='utf-8')

    from html5lib import treewalkers, serializer
    from html5lib.filters import sanitizer
    walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
    xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
    tree = xhtml.serialize(walker, encoding='utf-8')

    node['value'] = ''.join([str(token) for token in tree])


def scrub(feed_uri, data):
    """Apply the sanitizations configured for feed_uri to data, in place.

    feed_uri is passed to the config lookups (ignore_in_feed, title_type,
    summary_type, content_type, name_type, future_dates, xml_base).
    data must expose .feed and .entries; presumably it is a parsed
    feedparser result -- confirm against the caller.  Nothing is returned.
    """
    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0:
            tag = 'language'
        if data.feed.has_key(tag):
            del data.feed[tag]
        for entry in data.entries:
            for name in (tag, tag + "_detail", tag + "_parsed"):
                if entry.has_key(name):
                    del entry[name]
            for key in entry.keys():
                if not key.endswith('_detail'):
                    continue
                # iterate over a copy because the detail itself is mutated
                for detail in entry[key].copy():
                    if detail == tag:
                        del entry[key][detail]

    # adjust title, summary and content types
    for key, lookup in (('title_detail', config.title_type),
                        ('summary_detail', config.summary_type),
                        ('content', config.content_type)):
        configured = lookup(feed_uri)
        if not configured:
            continue
        mime = type_map.get(configured, configured)
        for entry in data.entries:
            if not entry.has_key(key):
                continue
            if key == 'content':
                # only the first content element is retyped
                entry.content[0]['type'] = mime
            else:
                entry[key]['type'] = mime

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        _strip_author_html(data.feed, stripHtml)
        for entry in data.entries:
            _strip_author_html(entry, stripHtml)
            if entry.has_key('source'):
                _strip_author_html(entry.source, stripHtml)

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates in ('ignore_date', 'ignore_entry'):
        _handle_future_dates(data, future_dates)

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'):
                continue
            if 'html' not in node['type']:
                continue
            if not node.has_key('value'):
                continue

            # NOTE(review): nesting reconstructed from a flattened listing;
            # URI resolution is assumed to apply only to nodes with a base.
            if node.has_key('base'):
                if scrub_xmlbase:
                    _rebase(node, entry, data.feed, scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            # Run this through HTML5's sanitizer
            _sanitize(node)
Something went wrong with that request. Please try again.