1 """
2 Process a set of configuration defined sanitations on a given feed.
3 """
4
5 # Standard library modules
6 import time
7 # Planet modules
8 import planet, config, shell
from planet import feedparser

type_map = {'text': 'text/plain', 'html': 'text/html',
            'xhtml': 'application/xhtml+xml'}

def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

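    # Example (hypothetical config value): a subscription section setting
    # "ignore_in_feed = updated license" would make the loop above delete
    # those elements from the feed, and from every entry along with their
    # *_detail and *_parsed variants.
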
    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

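    # Example (hypothetical config values): "title_type = html" or
    # "content_type = xhtml" are expanded through type_map to the full
    # MIME types above; any other value, such as an explicit MIME type,
    # is passed through unchanged by type_map.get().
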
    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
           data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
               entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                   source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

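    # Note: any name_type value containing "html" (hypothetically,
    # "name_type = html") enables the stripping above; author names in the
    # feed, entries, and entry sources are reduced to plain text via
    # shell.tmpl's stripHtml helper.
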
    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        # compare like with like: the *_parsed fields are time tuples, so
        # use time.gmtime() here rather than the float from time.time()
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries if
            (not entry.has_key('published_parsed') or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
                or entry['updated_parsed'] <= now)]
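    # (Summary of the two modes above: 'ignore_date' keeps a future-dated
    # entry but deletes its offending published/updated fields, while
    # 'ignore_entry' drops the whole entry from data.entries.)
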

    scrub_xmlbase = config.xml_base(feed_uri)

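    # The loop below recognizes three xml_base settings for nodes that carry
    # an xml:base: 'feed_alternate' (use the entry source's link if present,
    # otherwise the feed's own link), 'entry_alternate' (use the entry's own
    # link), or any other value, which is treated as a URI and joined onto
    # the node's existing base (so an absolute URI simply replaces it).
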
    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if 'html' not in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                           entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            # Run this through HTML5's sanitizer
            doc = None
            if 'xhtml' in node['type']:
                try:
                    from xml.dom import minidom
                    doc = minidom.parseString(node['value'])
                except:
                    node['type'] = 'text/html'

            if not doc:
                from html5lib import html5parser, treebuilders
                p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
                doc = p.parseFragment(node['value'], encoding='utf-8')

            from html5lib import treewalkers, serializer
            from html5lib.filters import sanitizer
            walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
            xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
            tree = xhtml.serialize(walker, encoding='utf-8')

            node['value'] = ''.join([str(token) for token in tree])
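
# A minimal usage sketch (assumptions: the Planet configuration has already
# been loaded, for example via config.load(), and the feed was fetched with
# the bundled feedparser; the names below are illustrative, not part of this
# module):
#
#     data = feedparser.parse(feed_uri)
#     scrub(feed_uri, data)
#
# scrub() modifies `data` in place: ignored elements are removed, declared
# title/summary/content types are applied, HTML is stripped from author
# names, future-dated items are handled per future_dates, xml:base is
# overridden per xml_base, and html content is resolved against its base
# (when present) and run through html5lib's sanitizer.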