Merge pull request #1020 from jojje/gzip_http_cache
[MRG+1] add gzip compression to filesystem http cache backend
pablohoffman committed Mar 17, 2015
2 parents f924567 + bd5d99a commit 934584a
Showing 4 changed files with 30 additions and 9 deletions.
12 changes: 12 additions & 0 deletions docs/topics/downloader-middleware.rst

@@ -563,6 +563,18 @@ Default: ``'scrapy.contrib.httpcache.DummyPolicy'``
 
 The class which implements the cache policy.
 
+.. setting:: HTTPCACHE_GZIP
+
+HTTPCACHE_GZIP
+^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``False``
+
+If enabled, will compress all cached data with gzip.
+This setting is specific to the Filesystem backend.
+
 
 HttpCompressionMiddleware
 -------------------------
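For context, a minimal sketch of how the new setting would be switched on from a project's settings.py (the HTTPCACHE_DIR value and the explicit HTTPCACHE_STORAGE line are illustrative, not part of this patch):

    # settings.py -- sketch: cache responses on disk and gzip-compress the cached files
    HTTPCACHE_ENABLED = True
    HTTPCACHE_DIR = 'httpcache'
    HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
    HTTPCACHE_GZIP = True  # new in this PR; defaults to False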
21 changes: 12 additions & 9 deletions scrapy/contrib/httpcache.py

@@ -1,5 +1,6 @@
 from __future__ import print_function
 import os
+import gzip
 from six.moves import cPickle as pickle
 from importlib import import_module
 from time import time
@@ -220,6 +221,8 @@ class FilesystemCacheStorage(object):
     def __init__(self, settings):
         self.cachedir = data_path(settings['HTTPCACHE_DIR'])
         self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
+        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
+        self._open = gzip.open if self.use_gzip else open
 
     def open_spider(self, spider):
         pass
@@ -233,9 +236,9 @@ def retrieve_response(self, spider, request):
         if metadata is None:
             return  # not cached
         rpath = self._get_request_path(spider, request)
-        with open(os.path.join(rpath, 'response_body'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
             body = f.read()
-        with open(os.path.join(rpath, 'response_headers'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
             rawheaders = f.read()
         url = metadata.get('response_url')
         status = metadata['status']
@@ -256,17 +259,17 @@ def store_response(self, spider, request, response):
             'response_url': response.url,
             'timestamp': time(),
         }
-        with open(os.path.join(rpath, 'meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
             f.write(repr(metadata))
-        with open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
             pickle.dump(metadata, f, protocol=2)
-        with open(os.path.join(rpath, 'response_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(response.headers))
-        with open(os.path.join(rpath, 'response_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
             f.write(response.body)
-        with open(os.path.join(rpath, 'request_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(request.headers))
-        with open(os.path.join(rpath, 'request_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
             f.write(request.body)
 
     def _get_request_path(self, spider, request):
@@ -281,7 +284,7 @@ def _read_meta(self, spider, request):
         mtime = os.stat(rpath).st_mtime
         if 0 < self.expiration_secs < time() - mtime:
             return  # expired
-        with open(metapath, 'rb') as f:
+        with self._open(metapath, 'rb') as f:
             return pickle.load(f)
 
 
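The change hinges on gzip.open returning a file-like object with the same read/write interface as the built-in open, so a single opener picked in __init__ serves every cache file. A standalone sketch of that pattern (the file name and temp directory below are illustrative, not taken from the patch):

    import gzip
    import os
    import tempfile

    # Pick the opener once, as FilesystemCacheStorage.__init__ does above.
    use_gzip = True
    _open = gzip.open if use_gzip else open

    # Writers and readers both go through _open, so neither needs to know
    # whether the bytes on disk are gzip-compressed.
    path = os.path.join(tempfile.mkdtemp(), 'response_body')
    with _open(path, 'wb') as f:
        f.write(b'<html>cached body</html>')
    with _open(path, 'rb') as f:
        assert f.read() == b'<html>cached body</html>'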
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py

@@ -154,6 +154,7 @@
 HTTPCACHE_IGNORE_SCHEMES = ['file']
 HTTPCACHE_DBM_MODULE = 'anydbm'
 HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'
+HTTPCACHE_GZIP = False
 
 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
 
5 changes: 5 additions & 0 deletions tests/test_downloadermiddleware_httpcache.py

@@ -148,6 +148,11 @@ class FilesystemStorageTest(DefaultStorageTest):
 
     storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
 
+class FilesystemStorageGzipTest(FilesystemStorageTest):
+
+    def _get_settings(self, **new_settings):
+        new_settings.setdefault('HTTPCACHE_GZIP', True)
+        return super(FilesystemStorageTest, self)._get_settings(**new_settings)
 
 class LeveldbStorageTest(DefaultStorageTest):
 
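The new test class reruns every FilesystemStorageTest case with HTTPCACHE_GZIP defaulted to True. A hypothetical spot check, not part of the patch, that a stored cache file really is compressed could look for the gzip magic bytes:

    # Hypothetical check (not in this PR): gzip streams start with the bytes \x1f\x8b.
    def is_gzipped(path):
        with open(path, 'rb') as f:
            return f.read(2) == b'\x1f\x8b'

    # e.g. is_gzipped('<HTTPCACHE_DIR>/<spider-name>/<key-prefix>/<key>/response_body')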
