Merge pull request #1020 from jojje/gzip_http_cache
[MRG+1] add gzip compression to filesystem http cache backend
pablohoffman committed Mar 17, 2015
2 parents f924567 + bd5d99a commit 934584a
Showing 4 changed files with 30 additions and 9 deletions.
12 changes: 12 additions & 0 deletions docs/topics/downloader-middleware.rst

@@ -563,6 +563,18 @@ Default: ``'scrapy.contrib.httpcache.DummyPolicy'``
 
 The class which implements the cache policy.
 
+.. setting:: HTTPCACHE_GZIP
+
+HTTPCACHE_GZIP
+^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``False``
+
+If enabled, will compress all cached data with gzip.
+This setting is specific to the Filesystem backend.
+
 
 HttpCompressionMiddleware
 -------------------------
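For context, a minimal sketch of how the new setting would be switched on from a project's settings.py (the HTTPCACHE_DIR value and the explicit HTTPCACHE_STORAGE line are illustrative, not part of this patch):

    # settings.py -- sketch: cache responses on disk and gzip-compress the cached files
    HTTPCACHE_ENABLED = True
    HTTPCACHE_DIR = 'httpcache'
    HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
    HTTPCACHE_GZIP = True  # new in this PR; defaults to False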
21 changes: 12 additions & 9 deletions scrapy/contrib/httpcache.py

@@ -1,5 +1,6 @@
 from __future__ import print_function
 import os
+import gzip
 from six.moves import cPickle as pickle
 from importlib import import_module
 from time import time
@@ -220,6 +221,8 @@ class FilesystemCacheStorage(object):
     def __init__(self, settings):
         self.cachedir = data_path(settings['HTTPCACHE_DIR'])
         self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
+        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
+        self._open = gzip.open if self.use_gzip else open
 
     def open_spider(self, spider):
         pass
@@ -233,9 +236,9 @@ def retrieve_response(self, spider, request):
         if metadata is None:
             return  # not cached
         rpath = self._get_request_path(spider, request)
-        with open(os.path.join(rpath, 'response_body'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
             body = f.read()
-        with open(os.path.join(rpath, 'response_headers'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
             rawheaders = f.read()
         url = metadata.get('response_url')
         status = metadata['status']
@@ -256,17 +259,17 @@ def store_response(self, spider, request, response):
             'response_url': response.url,
             'timestamp': time(),
         }
-        with open(os.path.join(rpath, 'meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
             f.write(repr(metadata))
-        with open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
             pickle.dump(metadata, f, protocol=2)
-        with open(os.path.join(rpath, 'response_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(response.headers))
-        with open(os.path.join(rpath, 'response_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
             f.write(response.body)
-        with open(os.path.join(rpath, 'request_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(request.headers))
-        with open(os.path.join(rpath, 'request_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
             f.write(request.body)
 
     def _get_request_path(self, spider, request):
@@ -281,7 +284,7 @@ def _read_meta(self, spider, request):
         mtime = os.stat(rpath).st_mtime
         if 0 < self.expiration_secs < time() - mtime:
             return  # expired
-        with open(metapath, 'rb') as f:
+        with self._open(metapath, 'rb') as f:
             return pickle.load(f)
 
 
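The change hinges on gzip.open returning a file-like object with the same read/write interface as the built-in open, so a single opener picked in __init__ serves every cache file. A standalone sketch of that pattern (the file name and temp directory below are illustrative, not taken from the patch):

    import gzip
    import os
    import tempfile

    # Pick the opener once, as FilesystemCacheStorage.__init__ does above.
    use_gzip = True
    _open = gzip.open if use_gzip else open

    # Writers and readers both go through _open, so neither needs to know
    # whether the bytes on disk are gzip-compressed.
    path = os.path.join(tempfile.mkdtemp(), 'response_body')
    with _open(path, 'wb') as f:
        f.write(b'<html>cached body</html>')
    with _open(path, 'rb') as f:
        assert f.read() == b'<html>cached body</html>'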
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py

@@ -154,6 +154,7 @@
 HTTPCACHE_IGNORE_SCHEMES = ['file']
 HTTPCACHE_DBM_MODULE = 'anydbm'
 HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'
+HTTPCACHE_GZIP = False
 
 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
 
5 changes: 5 additions & 0 deletions tests/test_downloadermiddleware_httpcache.py

@@ -148,6 +148,11 @@ class FilesystemStorageTest(DefaultStorageTest):
 
     storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
 
+class FilesystemStorageGzipTest(FilesystemStorageTest):
+
+    def _get_settings(self, **new_settings):
+        new_settings.setdefault('HTTPCACHE_GZIP', True)
+        return super(FilesystemStorageTest, self)._get_settings(**new_settings)
 
 class LeveldbStorageTest(DefaultStorageTest):
 
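The new test class reruns every FilesystemStorageTest case with HTTPCACHE_GZIP defaulted to True. A hypothetical spot check, not part of the patch, that a stored cache file really is compressed could look for the gzip magic bytes:

    # Hypothetical check (not in this PR): gzip streams start with the bytes \x1f\x8b.
    def is_gzipped(path):
        with open(path, 'rb') as f:
            return f.read(2) == b'\x1f\x8b'

    # e.g. is_gzipped('<HTTPCACHE_DIR>/<spider-name>/<key-prefix>/<key>/response_body')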
