Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 177 lines (145 sloc) 5.618 kb
16273f0 @rtyler Add first pass of proxylet server
authored
1 #!/usr/bin/env python
2 from __future__ import with_statement
3
d6cc068 @rtyler Re-organize some of the code for munging HTML documents
authored
4 import base64
16273f0 @rtyler Add first pass of proxylet server
authored
5 import contextlib
6 import gzip
7 import StringIO
8 import time
9
10 import eventlet
d6cc068 @rtyler Re-organize some of the code for munging HTML documents
authored
11 eventlet.monkey_patch()
16273f0 @rtyler Add first pass of proxylet server
authored
12 import eventlet.wsgi
13
14 from eventlet.green import httplib
15 from eventlet.green import urllib2
16
d6cc068 @rtyler Re-organize some of the code for munging HTML documents
authored
17 import lxml
18 import lxml.html
19 import memcache
20
16273f0 @rtyler Add first pass of proxylet server
authored
# Request headers copied from the incoming WSGI environ onto the upstream
# request.  (HTTP_COOKIE and a duplicate HTTP_ACCEPT_CHARSET were
# deliberately left commented out of the tuple.)
PROXIED_HEADERS = ('HTTP_USER_AGENT', 'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT',
        'HTTP_ACCEPT_LANGUAGE', )#'HTTP_COOKIE', 'HTTP_ACCEPT_CHARSET')
# Upstream status codes that mean "follow the Location header".
REDIRECT_CODES = (301, 302, 303,)

# Shared memcached client used by the (currently disabled) response cache.
# NOTE(review): 11212 is not memcached's default port (11211) — confirm the
# deployment actually listens there.
CACHE = memcache.Client(('127.0.0.1:11212',))
26
16273f0 @rtyler Add first pass of proxylet server
authored
def wsgi_ok(start_response, output, headers):
    """Send a 200 OK with the given header dict and return the body list."""
    header_pairs = []
    for name, value in headers.iteritems():
        header_pairs.append((name, value))
    start_response('200 OK', header_pairs)
    return [output]
30
def wsgi_error(start_response, output, headers):
    """Send a 500 Server Error with the given header dict; return the body."""
    start_response('500 Server Error', list(headers.iteritems()))
    return [output]
34
35 def fetch_from(method, url, headers):
36 print '>> Requesting: %s' % url
37 start = time.time()
38 request = urllib2.Request(url=url, headers=headers)
39 try:
40 return urllib2.urlopen(request)
41 finally:
42 print ('fetch_from(%s, %s, ..) took %s' % (method, url, (time.time() - start)))
43
d6cc068 @rtyler Re-organize some of the code for munging HTML documents
authored
44
class Munger(object):
    """Rewrites an HTML document so external resources are inlined.

    Every element in the parsed tree is dispatched to a `_handle_<tag>`
    method when one exists; handlers run concurrently on a green-thread
    pool and mutate the document in place.
    """

    def __init__(self, page_content, **kwargs):
        self.pool = eventlet.GreenPool()
        self.page_content = page_content
        self.doc = lxml.html.document_fromstring(page_content)

    def munge(self):
        """Run all matching tag handlers, then return the rewritten HTML."""
        for node in self.doc.getiterator():
            handler = getattr(self, '_handle_%s' % node.tag, None)
            if handler is not None:
                self.pool.spawn(handler, node)
        self.pool.waitall()
        return lxml.html.tostring(self.doc)

    def _handle_img(self, elem):
        """Replace an <img> src with a base64 data: URI of its content."""
        source = elem.attrib.get('src')
        if not source:
            return elem
        image_bytes = fetch_from('GET', source, {}).read()
        encoded = base64.encodestring(image_bytes)
        # The URL's trailing extension stands in for the image subtype.
        extension = source.split('.')[-1]
        elem.attrib['src'] = 'data:image/%s;base64,%s' % (extension, encoded)
        return elem

    def _handle_link(self, elem):
        """Replace a CSS <link> href with a base64 data: URI of the sheet."""
        href = elem.attrib.get('href')
        if not href or elem.attrib.get('type') != 'text/css':
            return elem
        stylesheet = fetch_from('GET', href, {}).read()
        encoded = base64.encodestring(stylesheet)
        elem.attrib['href'] = 'data:text/css;base64,%s' % encoded
        return elem

    def _ignore_handle_script(self, elem):
        """Disabled script inliner — renamed so munge() never dispatches it."""
        src = elem.attrib.get('src')
        if not src:
            return elem
        script = fetch_from('GET', src, {}).read()
        encoded = base64.encodestring(script)
        elem.attrib['src'] = 'data:text/x-js,%s' % encoded
        return elem
94
16273f0 @rtyler Add first pass of proxylet server
authored
def wsgi_proxy(env, start_response):
    """WSGI app: proxy a GET for the URL carried in env['PATH_INFO'].

    Follows upstream redirects by recursing on itself, inlines resources in
    text/html responses via Munger, and gzips the body when the client
    accepts it.  The memcached read/write paths exist but are disabled
    (`if False` branches below).
    """
    # Only plain-http GET requests are handled.
    if not env['wsgi.url_scheme'] == 'http':
        return wsgi_error(start_response, 'Error\r\n', {})

    if not env['REQUEST_METHOD'] == 'GET':
        return wsgi_error(start_response, 'Only GET is suppported\r\n', {})

    # Strip off early 'http://'
    url = env['PATH_INFO']
    # Copy only the whitelisted request headers out of the WSGI environ.
    headers = dict(((k, env[k]) for k in PROXIED_HEADERS if env.has_key(k)))

    if env['QUERY_STRING']:
        url = '%s?%s' % (url, env['QUERY_STRING'])

    # Cache lookup is disabled for now, so `cached` always stays False.
    cached = False
    #if CACHE.get(url):
    if False:
        print '>>> Getting %s from the cache' % url
        cached = True

    try:
        response = fetch_from(env['REQUEST_METHOD'], url, headers)
    except urllib2.HTTPError, ex:
        # Relay the upstream error status verbatim, with an empty body.
        start_response('%s %s' % (ex.getcode(), ex.info()), [])
        return ['']

    headers = dict(response.headers)

    if response.code in REDIRECT_CODES:
        if not headers.get('location'):
            return wsgi_error(start_response, 'No Location header given with redirect code %d\r\n' % response.code, {})
        print ('Redirecting', env['PATH_INFO'], headers['location'])
        # Follow the redirect by re-entering this handler with the new URL.
        # NOTE(review): no depth guard — a redirect loop would recurse until
        # the interpreter's stack limit.
        env.update({'PATH_INFO' : headers['location']})
        return wsgi_proxy(env, start_response)

    # We buffer and return a complete body, so any chunked transfer-encoding
    # header from upstream no longer applies.
    headers.pop('transfer-encoding', None)
    print ('headers', headers)
    response = response.read()
    # Derived but currently unused beyond this point.
    parts = url.split('.')
    suffix = parts[-1]
    if suffix:
        suffix = suffix.split('?')[0]
    munger = None
    # Inline images/CSS for HTML responses (exact content-type match only;
    # 'text/html; charset=...' would not match this comparison).
    if headers.get('content-type') == 'text/html':
        munger = Munger(response)
        response = munger.munge()

    # Cache store is disabled for now (would honour max-age when enabled).
    #if not cached and headers.get('cache-control'):
    if False:
        parts = headers['cache-control'].split(',')
        for part in parts:
            part = part.strip()
            if not part.startswith('max-age'):
                continue
            unused, age = part.split('=')
            age = int(age)
            if age <= 0:
                continue
            print ('I should cache %s for %ss (%d bytes)' % (url, age, len(response)))
            CACHE.set(url, response, time=age)

    print ('Sending proxy response for', url)
    # Compress the (non-empty) body if the client advertised gzip support.
    if response and 'gzip' in env.get('HTTP_ACCEPT_ENCODING', ''):
        headers['Content-Encoding'] = 'gzip'
        start = time.time()
        out = StringIO.StringIO()
        gzout = gzip.GzipFile(None, 'wb', 9, fileobj=out)
        gzout.write(response)
        gzout.close()
        response = out.getvalue()
        print ('gzipping took', (time.time() - start))
    print '>> Returning %d bytes for %s' % (len(response), url)
    return wsgi_ok(start_response, response, headers)
168
def main():
    """Start the proxy's WSGI server on localhost:8199 (blocks forever)."""
    eventlet.wsgi.server(eventlet.listen(('localhost', 8199)), wsgi_proxy,
            log_x_forwarded_for=True, keepalive=False,
            max_size=1024)
    return 0

if __name__ == '__main__':
    exit(main())
Something went wrong with that request. Please try again.