Re-organize some of the code for munging HTML documents

commit d6cc0687924378da1196be86a9d9990727671ef1 (parent: 16273f0)
Author: R. Tyler Croy

Showing 1 changed file with 95 additions and 2 deletions.

proxylet.py: +95 additions, −2 deletions

@@ -1,21 +1,29 @@
 #!/usr/bin/env python
 from __future__ import with_statement
 
+import base64
 import contextlib
 import gzip
 import StringIO
 import time
 
 import eventlet
+eventlet.monkey_patch()
 import eventlet.wsgi
 
 from eventlet.green import httplib
 from eventlet.green import urllib2
 
+import lxml
+import lxml.html
+import memcache
+
 PROXIED_HEADERS = ('HTTP_USER_AGENT', 'HTTP_ACCEPT_CHARSET', 'HTTP_ACCEPT',
     'HTTP_ACCEPT_LANGUAGE', )#'HTTP_COOKIE', 'HTTP_ACCEPT_CHARSET')
 REDIRECT_CODES = (301, 302, 303,)
 
+CACHE = memcache.Client(('127.0.0.1:11212',))
+
 def wsgi_ok(start_response, output, headers):
     start_response('200 OK', [(k, v) for k, v in headers.iteritems()])
     return [output]
@@ -33,6 +41,57 @@ def fetch_from(method, url, headers):
     finally:
         print ('fetch_from(%s, %s, ..) took %s' % (method, url, (time.time() - start)))
 
+
+class Munger(object):
+    def __init__(self, page_content, **kwargs):
+        self.pool = eventlet.GreenPool()
+        self.page_content = page_content
+        self.doc = lxml.html.document_fromstring(page_content)
+
+    def munge(self):
+        for element in self.doc.getiterator():
+            method = '_handle_%s' % element.tag
+            method = getattr(self, method, None)
+            if method is None:
+                continue
+            self.pool.spawn(method, element)
+        self.pool.waitall()
+        return lxml.html.tostring(self.doc)
+
+    def _handle_img(self, elem):
+        if not elem.attrib.get('src'):
+            return elem
+
+        source = elem.attrib['src']
+        image = fetch_from('GET', source, {})
+        image = image.read()
+        b64image = base64.encodestring(image)
+        pieces = source.split('.')
+        elem.attrib['src'] = 'data:image/%s;base64,%s' % (pieces[-1], b64image)
+        return elem
+
+    def _handle_link(self, elem):
+        if not elem.attrib.get('href') or not elem.attrib.get('type') == 'text/css':
+            return elem
+
+        href = elem.attrib['href']
+        css = fetch_from('GET', href, {})
+        css = css.read()
+        b64css = base64.encodestring(css)
+        elem.attrib['href'] = 'data:text/css;base64,%s' % b64css
+        return elem
+
+    def _ignore_handle_script(self, elem):
+        if not elem.attrib.get('src'):
+            return elem
+
+        src = elem.attrib['src']
+        js = fetch_from('GET', src, {})
+        js = js.read()
+        b64js = base64.encodestring(js)
+        elem.attrib['src'] = 'data:text/x-js,%s' % b64js
+        return elem
+
 def wsgi_proxy(env, start_response):
     if not env['wsgi.url_scheme'] == 'http':
         return wsgi_error(start_response, 'Error\r\n', {})
@@ -47,7 +106,18 @@ def wsgi_proxy(env, start_response):
     if env['QUERY_STRING']:
         url = '%s?%s' % (url, env['QUERY_STRING'])
 
-    response = fetch_from(env['REQUEST_METHOD'], url, headers)
+    cached = False
+    #if CACHE.get(url):
+    if False:
+        print '>>> Getting %s from the cache' % url
+        cached = True
+
+    try:
+        response = fetch_from(env['REQUEST_METHOD'], url, headers)
+    except urllib2.HTTPError, ex:
+        start_response('%s %s' % (ex.getcode(), ex.info()), [])
+        return ['']
+
     headers = dict(response.headers)
 
     if response.code in REDIRECT_CODES:
@@ -60,7 +130,30 @@ def wsgi_proxy(env, start_response):
     headers.pop('transfer-encoding', None)
     print ('headers', headers)
     response = response.read()
-
+    parts = url.split('.')
+    suffix = parts[-1]
+    if suffix:
+        suffix = suffix.split('?')[0]
+    munger = None
+    if headers.get('content-type') == 'text/html':
+        munger = Munger(response)
+        response = munger.munge()
+
+    #if not cached and headers.get('cache-control'):
+    if False:
+        parts = headers['cache-control'].split(',')
+        for part in parts:
+            part = part.strip()
+            if not part.startswith('max-age'):
+                continue
+            unused, age = part.split('=')
+            age = int(age)
+            if age <= 0:
+                continue
+            print ('I should cache %s for %ss (%d bytes)' % (url, age, len(response)))
+            CACHE.set(url, response, time=age)
+
+    print ('Sending proxy response for', url)
     if response and 'gzip' in env.get('HTTP_ACCEPT_ENCODING', ''):
         headers['Content-Encoding'] = 'gzip'
         start = time.time()
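The new Munger class rewrites fetched HTML in place, replacing external image and stylesheet references with base64-encoded data: URIs so the proxied page needs no follow-up requests for those resources. A minimal sketch of driving it directly, assuming proxylet.py is importable and the referenced URL is reachable (the example HTML and URL are hypothetical):

    from proxylet import Munger

    # A page with one external image; Munger fetches it through fetch_from()
    # and inlines it as a data: URI.
    html = '<html><body><img src="http://example.com/logo.png"/></body></html>'

    munger = Munger(html)
    result = munger.munge()
    # result should now carry something like:
    #   <img src="data:image/png;base64,iVBORw0KGgo...">
    print result

Note that script inlining is effectively disabled in this commit: the handler is named _ignore_handle_script, so the getattr('_handle_script') lookup in munge() never finds it.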
