Skip to content
Browse files

Re-organize some of the code for munging HTML documents

  • Loading branch information...
1 parent 16273f0 commit d6cc0687924378da1196be86a9d9990727671ef1 @rtyler committed Jun 27, 2010
Showing with 95 additions and 2 deletions.
  1. +95 −2 proxylet.py
View
97 proxylet.py
@@ -1,21 +1,29 @@
#!/usr/bin/env python
from __future__ import with_statement
+import base64
import contextlib
import gzip
import StringIO
import time
import eventlet
+eventlet.monkey_patch()
import eventlet.wsgi
from eventlet.green import httplib
from eventlet.green import urllib2
+import lxml
+import lxml.html
+import memcache
+
# WSGI environ keys (HTTP_* form) for selected request headers.
# NOTE(review): presumably these are the client headers forwarded upstream;
# the code that consumes this tuple is not visible here -- confirm.
PROXIED_HEADERS = ('HTTP_USER_AGENT', 'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT',
'HTTP_ACCEPT_LANGUAGE', )#'HTTP_COOKIE', 'HTTP_ACCEPT_CHARSET')
# Upstream response codes that wsgi_proxy tests `response.code` against.
REDIRECT_CODES = (301, 302, 303,)
# memcache client pointed at 127.0.0.1:11212. In the visible code it is only
# referenced from the caching branches of wsgi_proxy, which are currently
# disabled (`if False:`).
+CACHE = memcache.Client(('127.0.0.1:11212',))
+
def wsgi_ok(start_response, output, headers):
    """Start a ``200 OK`` WSGI response and return its body.

    Args:
        start_response: the WSGI ``start_response`` callable.
        output: the response body; returned as the single item of the
            body iterable.
        headers: mapping of header name to header value.

    Returns:
        A one-element list containing ``output``.
    """
    # items() rather than iteritems(): same pairs on Python 2, and it also
    # works for mappings (and interpreters) that do not provide iteritems().
    start_response('200 OK', [(k, v) for k, v in headers.items()])
    return [output]
@@ -33,6 +41,57 @@ def fetch_from(method, url, headers):
finally:
print ('fetch_from(%s, %s, ..) took %s' % (method, url, (time.time() - start)))
+
class Munger(object):
    """Rewrite an HTML document so external resources are embedded inline.

    The page is parsed with ``lxml.html``. ``munge()`` walks the document and,
    for every element whose tag has a matching ``_handle_<tag>`` method,
    runs that method on a green thread. The handlers fetch the referenced
    resource and replace its URL with a base64 ``data:`` URI.
    """

    # Extensions whose image subtype is not simply the extension itself.
    _IMAGE_SUBTYPE_ALIASES = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml'}

    def __init__(self, page_content, **kwargs):
        """Parse ``page_content`` (the HTML source) into ``self.doc``.

        Extra keyword arguments are accepted and ignored.
        """
        self.pool = eventlet.GreenPool()
        self.page_content = page_content
        self.doc = lxml.html.document_fromstring(page_content)

    def munge(self):
        """Run all matching tag handlers and return the serialized document.

        Each handler is spawned on ``self.pool``; the document is serialized
        only after every spawned handler has finished.
        """
        for element in self.doc.iter():
            handler = getattr(self, '_handle_%s' % element.tag, None)
            if handler is not None:
                self.pool.spawn(handler, element)
        self.pool.waitall()
        return lxml.html.tostring(self.doc)

    @staticmethod
    def _data_uri(mime_type, payload):
        """Return the raw bytes ``payload`` as a base64 ``data:`` URI."""
        # b64encode emits no line breaks; encodestring wraps every 76
        # characters and appends a newline, which would end up inside the
        # attribute value.
        encoded = base64.b64encode(payload).decode('ascii')
        return 'data:%s;base64,%s' % (mime_type, encoded)

    @classmethod
    def _image_mime_type(cls, source):
        """Guess an image MIME type from the file extension in ``source``.

        The query string and fragment are ignored. When the last path
        segment has no extension, ``application/octet-stream`` is returned.
        """
        path = source.split('#', 1)[0].split('?', 1)[0]
        filename = path.rsplit('/', 1)[-1]
        _, dot, extension = filename.rpartition('.')
        if not dot or not extension:
            return 'application/octet-stream'
        extension = extension.lower()
        return 'image/%s' % cls._IMAGE_SUBTYPE_ALIASES.get(extension, extension)

    def _inline(self, elem, attribute, mime_type):
        """Replace the URL in ``elem.attrib[attribute]`` with a ``data:`` URI.

        The resource is fetched with ``fetch_from('GET', url, {})``. URLs that
        are already ``data:`` URIs are left untouched. Returns ``elem``.
        """
        url = elem.attrib[attribute]
        if url.startswith('data:'):
            # Already inline; there is nothing to fetch.
            return elem
        payload = fetch_from('GET', url, {}).read()
        elem.attrib[attribute] = self._data_uri(mime_type, payload)
        return elem

    def _handle_img(self, elem):
        """Inline the image referenced by an ``<img src=...>`` element."""
        source = elem.attrib.get('src')
        if not source:
            return elem
        return self._inline(elem, 'src', self._image_mime_type(source))

    def _handle_link(self, elem):
        """Inline the stylesheet of a ``<link type="text/css" href=...>``.

        Links without an ``href`` or with any other ``type`` are returned
        unchanged.
        """
        if not elem.attrib.get('href') or elem.attrib.get('type') != 'text/css':
            return elem
        return self._inline(elem, 'href', 'text/css')

    def _ignore_handle_script(self, elem):
        """Inline the script referenced by a ``<script src=...>`` element.

        The name does not match the ``_handle_<tag>`` pattern, so ``munge()``
        never dispatches to this method.
        """
        if not elem.attrib.get('src'):
            return elem
        return self._inline(elem, 'src', 'text/x-js')
+
def wsgi_proxy(env, start_response):
if not env['wsgi.url_scheme'] == 'http':
return wsgi_error(start_response, 'Error\r\n', {})
@@ -47,7 +106,18 @@ def wsgi_proxy(env, start_response):
if env['QUERY_STRING']:
url = '%s?%s' % (url, env['QUERY_STRING'])
- response = fetch_from(env['REQUEST_METHOD'], url, headers)
+ cached = False
+ #if CACHE.get(url):
+ if False:
+ print '>>> Getting %s from the cache' % url
+ cached = True
+
+ try:
+ response = fetch_from(env['REQUEST_METHOD'], url, headers)
+ except urllib2.HTTPError, ex:
+ start_response('%s %s' % (ex.getcode(), ex.info()), [])
+ return ['']
+
headers = dict(response.headers)
if response.code in REDIRECT_CODES:
@@ -60,7 +130,30 @@ def wsgi_proxy(env, start_response):
headers.pop('transfer-encoding', None)
print ('headers', headers)
response = response.read()
-
+ parts = url.split('.')
+ suffix = parts[-1]
+ if suffix:
+ suffix = suffix.split('?')[0]
+ munger = None
+ if headers.get('content-type') == 'text/html':
+ munger = Munger(response)
+ response = munger.munge()
+
+ #if not cached and headers.get('cache-control'):
+ if False:
+ parts = headers['cache-control'].split(',')
+ for part in parts:
+ part = part.strip()
+ if not part.startswith('max-age'):
+ continue
+ unused, age = part.split('=')
+ age = int(age)
+ if age <= 0:
+ continue
+ print ('I should cache %s for %ss (%d bytes)' % (url, age, len(response)))
+ CACHE.set(url, response, time=age)
+
+ print ('Sending proxy response for', url)
if response and 'gzip' in env.get('HTTP_ACCEPT_ENCODING', ''):
headers['Content-Encoding'] = 'gzip'
start = time.time()

0 comments on commit d6cc068

Please sign in to comment.
Something went wrong with that request. Please try again.