This repository has been archived by the owner on Nov 9, 2017. It is now read-only.
/
filters.py
241 lines (198 loc) · 7.01 KB
/
filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
# Inc. All Rights Reserved.
###############################################################################
import cgi
import os
import urllib
import re
import snudown
from cStringIO import StringIO
from xml.sax.handler import ContentHandler
from lxml.sax import saxify
import lxml.etree
from BeautifulSoup import BeautifulSoup
from pylons import g, c
from wrapped import Templated, CacheStub
# Sentinel comments understood by spaceCompress: content bracketed by
# SC_OFF ... SC_ON is left uncompressed.
SC_OFF = "<!-- SC_OFF -->"
SC_ON = "<!-- SC_ON -->"

# Wrapper markup emitted around rendered markdown by safemarkdown(wrap=True).
MD_START = '<div class="md">'
MD_END = '</div>'
def python_websafe(text):
    """HTML-escape &, <, > and double quotes in *text*.

    Pure-Python fallback used when the Cfilters C extension is not
    available.  '&' is replaced first so the other entities are not
    double-escaped.

    NOTE(review): the scraped source showed identity replacements
    (e.g. replace('&', "&")) because the HTML entities in the string
    literals were unescaped in transit; the entity strings are restored
    here.
    """
    return (text.replace('&', "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;"))
def python_websafe_json(text):
    """HTML-escape &, < and > (but not quotes) in *text* for JSON output.

    Pure-Python fallback used when the Cfilters C extension is not
    available.  '&' is replaced first to avoid double-escaping.

    NOTE(review): entity strings restored — the scraped source had them
    unescaped back to identity replacements.
    """
    return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;")
# Prefer the compiled Cfilters implementations of the escaping and
# space-compression helpers; fall back to pure Python when the extension
# is not built.
try:
    from Cfilters import uwebsafe as c_websafe, uspace_compress, \
        uwebsafe_json as c_websafe_json

    def spaceCompress(text):
        """Coerce *text* to unicode and compress whitespace via the C helper."""
        try:
            text = unicode(text, 'utf-8')
        except TypeError:
            # Already unicode (or some other non-str type).
            text = unicode(text)
        return uspace_compress(text)
except ImportError:
    c_websafe = python_websafe
    c_websafe_json = python_websafe_json

    # Whitespace runs adjacent to tag boundaries, and generic runs of
    # whitespace, to be collapsed by the fallback spaceCompress.
    _between_tags1 = re.compile('> +')
    _between_tags2 = re.compile(' +<')
    _spaces = re.compile('[\s]+')
    # Split pattern keeping the SC_OFF/SC_ON sentinels so compression can
    # be toggled per-section.
    _ignore = re.compile('(' + SC_OFF + '|' + SC_ON + ')', re.S | re.I)

    def spaceCompress(content):
        """Collapse whitespace in *content*, except inside SC_OFF...SC_ON
        sections, which are passed through untouched."""
        res = ''
        sc = True  # compression currently enabled?
        for p in _ignore.split(content):
            if p == SC_ON:
                sc = True
            elif p == SC_OFF:
                sc = False
            elif sc:
                p = _spaces.sub(' ', p)
                p = _between_tags1.sub('>', p)
                p = _between_tags2.sub('<', p)
                res += p
            else:
                res += p
        return res
class _Unsafe(unicode): pass
def _force_unicode(text):
if text == None:
return u''
if isinstance(text, unicode):
return text
try:
text = unicode(text, 'utf-8')
except UnicodeDecodeError:
text = unicode(text, 'latin1')
except TypeError:
text = unicode(text)
return text
def _force_utf8(text):
    """Return *text* coerced to a utf-8 encoded byte string."""
    as_unicode = _force_unicode(text)
    return str(as_unicode.encode('utf8'))
def unsafe(text=''):
    """Mark *text* as already-safe markup by wrapping it in _Unsafe, so
    the websafe filters will leave it unescaped."""
    coerced = _force_unicode(text)
    return _Unsafe(coerced)
def websafe_json(text=""):
    """HTML-escape *text* for embedding in JSON responses."""
    coerced = _force_unicode(text)
    return c_websafe_json(coerced)
def mako_websafe(text = ''):
    """Default HTML-escaping filter for mako templates.

    Values already marked _Unsafe pass through unchanged; Templated and
    CacheStub objects are rendered/wrapped without escaping; None becomes
    "".  Everything else is coerced to unicode and escaped.
    """
    # Already escaped/trusted -- return as-is.
    if text.__class__ == _Unsafe:
        return text
    # Templated objects render themselves; the result is not re-escaped.
    elif isinstance(text, Templated):
        return _Unsafe(text.render())
    # Cache placeholder -- substituted later, must not be escaped.
    elif isinstance(text, CacheStub):
        return _Unsafe(text)
    elif text is None:
        return ""
    elif text.__class__ != unicode:
        text = _force_unicode(text)
    return c_websafe(text)
def websafe(text=''):
    """HTML-escape *text*.

    The result is wrapped in _Unsafe so that a later pass through
    mako_websafe will not escape it a second time.
    """
    if not isinstance(text, unicode):
        text = _force_unicode(text)
    return _Unsafe(c_websafe(text))
from mako.filters import url_escape
def edit_comment_filter(text = ''):
    """URL-escape comment text (coerced to unicode) for the edit form."""
    if isinstance(text, str):
        text = unicode(text, 'utf-8')
    elif not isinstance(text, unicode):
        text = unicode(text)
    return url_escape(text)
# Link prefixes accepted on <a href> attributes by SouptestSaxHandler.
# Relative ('/'), fragment ('#'), and a whitelist of absolute schemes;
# anything else is rejected as a potential javascript:/data: vector.
valid_link_schemes = (
    '/',
    '#',
    'http://',
    'https://',
    'ftp://',
    'mailto:',
    'steam://',
    'irc://',
    'ircs://',
    'news://',
    'mumble://',
    'ssh://',
    'git://',
)
class SouptestSaxHandler(ContentHandler):
    """SAX handler that validates rendered markdown against a whitelist.

    Raises ValueError on any tag or attribute not present in *ok_tags*,
    on any namespaced attribute, and on <a href> values whose prefix is
    not in valid_link_schemes.
    """

    def __init__(self, ok_tags):
        # Mapping of allowed tag name -> collection of allowed attributes.
        self.ok_tags = ok_tags

    def startElementNS(self, tagname, qname, attrs):
        if qname not in self.ok_tags:
            raise ValueError('HAX: Unknown tag: %r' % qname)

        for (ns, name), val in attrs.items():
            # Namespaced attributes never appear in legitimate output.
            if ns is not None:
                raise ValueError('HAX: Unknown namespace? Seriously? %r' % ns)
            if name not in self.ok_tags[qname]:
                raise ValueError('HAX: Unknown attribute-name %r' % name)
            # Links get extra scrutiny: only whitelisted schemes allowed.
            if qname == 'a' and name == 'href':
                lv = val.lower()
                if not any(lv.startswith(scheme)
                           for scheme in valid_link_schemes):
                    raise ValueError('HAX: Unsupported link scheme %r' % val)
# Whitelist consumed by SouptestSaxHandler: tag name -> collection of
# attribute names permitted on that tag.
markdown_ok_tags = {
    # Fix: ('class') is just the string 'class', so the handler's
    # `name not in self.ok_tags[qname]` check did substring matching
    # ('lass' in 'class' would pass).  Use a real one-element tuple.
    'div': ('class',),
    'a': set(('href', 'title', 'target', 'nofollow')),
    'table': ("align", ),
    'th': ("align", ),
    'td': ("align", ),
}

# Structural tags that carry no attributes at all.
markdown_boring_tags = ('p', 'em', 'strong', 'br', 'ol', 'ul', 'hr', 'li',
                        'pre', 'code', 'blockquote', 'center',
                        'tbody', 'thead', 'tr', 'sup', 'del',
                        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',)
for bt in markdown_boring_tags:
    markdown_ok_tags[bt] = ()
# Local XHTML DTD shipped with the repo; referenced by markdown_souptest
# so lxml can resolve the standard XHTML entities during validation.
markdown_xhtml_dtd_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'contrib/dtds/xhtml.dtd')
markdown_dtd = '<!DOCTYPE div- SYSTEM "file://%s">' % markdown_xhtml_dtd_path
def markdown_souptest(text, nofollow=False, target=None):
    """Render markdown *text* and validate the resulting HTML.

    The rendered markup is parsed with a whitelisting SAX handler
    (SouptestSaxHandler), which raises ValueError if any disallowed tag,
    attribute, or link scheme appears.  Returns the rendered markup on
    success; empty/falsy input is returned unchanged.
    """
    if not text:
        return text

    smd = safemarkdown(text, nofollow=nofollow, target=target)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd
    s = StringIO(smd_with_dtd)

    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
#TODO markdown should be looked up in batch?
#@memoize('markdown')
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    """Render markdown *text* to HTML via snudown.

    Returns None for empty/falsy input.  With wrap=True the output is
    wrapped in the standard <div class="md"> container; either way it is
    bracketed with SC_OFF/SC_ON so spaceCompress leaves it untouched.
    The link target may be passed as a "target" keyword; otherwise it
    defaults to "_top" when c.cname is set.
    """
    if not text:
        return None

    # this lets us skip the c.cname lookup (which is apparently quite
    # slow) if target was explicitly passed to this function.
    target = kwargs.get("target", None)
    if "target" not in kwargs and c.cname:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
def keep_space(text):
    """HTML-escape *text* and encode its whitespace as numeric character
    references so the space compressor cannot collapse it."""
    escaped = websafe(text)
    for ch in " \n\r\t":
        escaped = escaped.replace(ch, '&#%02d;' % ord(ch))
    return unsafe(escaped)
def unkeep_space(text):
    """Invert keep_space: decode its numeric character references back
    into literal whitespace.

    NOTE(review): the scraped source showed the entity literals rendered
    as real whitespace; they are restored here from keep_space's encoder
    ('&#%02d;' % ord(ch)).  Also decodes &#13; (\\r), which keep_space
    emits but the original never converted back.
    """
    return (text.replace('&#32;', ' ')
                .replace('&#10;', '\n')
                .replace('&#13;', '\r')
                .replace('&#09;', '\t'))