Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Instead of extending from HttpCachePolicy, following the same approach used for storage selection
  • Loading branch information...
commit cf5f0203b782425aec470e603853885e2aec641c 1 parent 492831f
@pedrofaustino pedrofaustino authored
View
1  .gitignore
@@ -4,3 +4,4 @@ dropin.cache
docs/build
*egg-info
.tox
+venv
View
59 scrapy/contrib/downloadermiddleware/httpcache.py
@@ -16,44 +16,51 @@
from scrapy.utils.project import data_path
-class HttpCachePolicy(object):
+class DummyPolicy(object):
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
- self.policy = settings.get('HTTPCACHE_POLICY')
+
+ def should_cache_response(self, response):
+ return response.status not in self.ignore_http_codes
+
+ def should_cache_request(self, request):
+ return urlparse_cached(request).scheme not in self.ignore_schemes
- if self.policy == 'dummy':
- self.use_dummy_cache = True
- else:
- self.use_dummy_cache = False
+
+class RFC2616Policy(DummyPolicy):
+ def __init__(self, settings):
+ super(RFC2616Policy, self).__init__(settings)
def should_cache_response(self, response):
- retval = response.status not in self.ignore_http_codes
- if not self.use_dummy_cache and response.headers.has_key('cache-control'):
+ retval = super(RFC2616Policy, self).should_cache_response(response)
+
+ if response.headers.has_key('cache-control'):
retval = retval and (response.headers['cache-control'].lower().find('no-store') == -1)
#retval = retval and self.policy_response(response)
return retval
def should_cache_request(self, request):
- retval = urlparse_cached(request).scheme not in self.ignore_schemes
- if not self.use_dummy_cache and request.headers.has_key('cache-control'):
+ retval = super(RFC2616Policy, self).should_cache_request(request)
+
+ if request.headers.has_key('cache-control'):
retval = retval and (request.headers['cache-control'].lower().find('no-store') == -1)
#retval = retval and self.policy_request(request)
return retval
-class HttpCacheMiddleware(HttpCachePolicy):
+class HttpCacheMiddleware(object):
def __init__(self, settings, stats):
if not settings.getbool('HTTPCACHE_ENABLED'):
raise NotConfigured
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
+ self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
self.stats = stats
- super(HttpCacheMiddleware, self).__init__(settings)
@classmethod
def from_crawler(cls, crawler):
- o = cls.from_settings(crawler.settings, crawler.stats)
+ o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
@@ -65,7 +72,7 @@ def spider_closed(self, spider):
self.storage.close_spider(spider)
def process_request(self, request, spider):
- if not self.should_cache_request(request):
+ if not self.policy.should_cache_request(request):
return
response = self.storage.retrieve_response(spider, request)
@@ -75,14 +82,14 @@ def process_request(self, request, spider):
self.stats.inc_value('httpcache/revalidation', spider=spider)
return
- if response and self.should_cache_response(response):
+ if response and self.policy.should_cache_response(response):
self.stats.inc_value('httpcache/hit', spider=spider)
- if self.use_dummy_cache:
- response.flags.append('cached')
- return response
- else:
+ if isinstance(self.policy, RFC2616Policy):
# Response cached and fresh
raise IgnoreRequest("Ignored request already in cache: %s" % request)
+ else:
+ response.flags.append('cached')
+ return response
# Response not cached
self.stats.inc_value('httpcache/miss', spider=spider)
@@ -90,19 +97,19 @@ def process_request(self, request, spider):
raise IgnoreRequest("Ignored request not in cache: %s" % request)
def process_response(self, request, response, spider):
- if (self.should_cache_request(request)
- and self.should_cache_response(response)):
- if self.use_dummy_cache:
- if 'cached' not in response.flags:
- self.storage.store_response(spider, request, response)
- self.stats.inc_value('httpcache/store', spider=spider)
- else:
+ if (self.policy.should_cache_request(request)
+ and self.policy.should_cache_response(response)):
+ if isinstance(self.policy, RFC2616Policy):
if response.status != 304:
self.storage.store_response(spider, request, response)
self.stats.inc_value('httpcache/store', spider=spider)
else:
response.flags.append('cached')
self.stats.inc_value('httpcache/hit', spider=spider)
+ else:
+ if 'cached' not in response.flags:
+ self.storage.store_response(spider, request, response)
+ self.stats.inc_value('httpcache/store', spider=spider)
return response
View
2  scrapy/settings/default_settings.py
@@ -141,7 +141,7 @@
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
-HTTPCACHE_POLICY = 'dummy'
+HTTPCACHE_POLICY = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
View
47 scrapy/tests/test_downloadermiddleware_httpcache.py
@@ -20,6 +20,9 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
storage_class = FilesystemCacheStorage
realcache_storage_class = DbmRealCacheStorage
+ dummy_policy = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
+ rfc2616_policy = 'scrapy.contrib.downloadermiddleware.httpcache.RFC2616Policy'
+
yesterday = email.utils.formatdate(time.time() - 1 * 24 * 60 * 60)
now = email.utils.formatdate()
tomorrow = email.utils.formatdate(time.time() + 1 * 24 * 60 * 60)
@@ -44,7 +47,7 @@ def _get_settings(self, **new_settings):
'HTTPCACHE_DIR': self.tmpdir,
'HTTPCACHE_EXPIRATION_SECS': 1,
'HTTPCACHE_IGNORE_HTTP_CODES': [],
- 'HTTPCACHE_POLICY': 'dummy'
+ 'HTTPCACHE_POLICY': self.dummy_policy
}
settings.update(new_settings)
return Settings(settings)
@@ -52,7 +55,7 @@ def _get_settings(self, **new_settings):
@contextmanager
def _storage(self, **new_settings):
settings = self._get_settings(**new_settings)
- if settings.get('HTTPCACHE_POLICY') == 'dummy':
+ if settings.get('HTTPCACHE_POLICY') == self.dummy_policy:
storage = self.storage_class(settings)
else:
storage = self.realcache_storage_class(settings)
@@ -182,10 +185,10 @@ def test_middleware_ignore_http_codes(self):
self.assertEqualResponse(self.response, response)
assert 'cached' in response.flags
- def test_real_http_cache_middleware_response304_not_cached(self):
+ def test_middleware_rfc2616policy_response304_not_cached(self):
# test response is not cached because the status is 304 Not Modified
# (so it should be cached already)
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
assert mw.process_request(self.request, self.spider) is None
response = Response('http://www.example.com', status=304)
mw.process_response(self.request, response, self.spider)
@@ -194,10 +197,10 @@ def test_real_http_cache_middleware_response304_not_cached(self):
assert mw.storage.retrieve_response(self.spider, self.request) is None
assert mw.process_request(self.request, self.spider) is None
- def test_real_http_cache_middleware_response_nostore_not_cached(self):
+ def test_middleware_rfc2616policy_response_nostore_not_cached(self):
# test response is not cached because of the Cache-Control 'no-store' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
assert mw.process_request(self.request, self.spider) is None
response = Response('http://www.example.com', headers=
{'Content-Type': 'text/html', 'Cache-Control': 'no-store'},
@@ -207,10 +210,10 @@ def test_real_http_cache_middleware_response_nostore_not_cached(self):
assert mw.storage.retrieve_response(self.spider, self.request) is None
assert mw.process_request(self.request, self.spider) is None
- def test_real_http_cache_middleware_request_nostore_not_cached(self):
+ def test_middleware_rfc2616policy_request_nostore_not_cached(self):
# test response is not cached because of the request's Cache-Control 'no-store' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
request = Request('http://www.example.com',
headers={'User-Agent': 'test', 'Cache-Control': 'no-store'})
assert mw.process_request(request, self.spider) is None
@@ -219,16 +222,16 @@ def test_real_http_cache_middleware_request_nostore_not_cached(self):
assert mw.storage.retrieve_response(self.spider, request) is None
assert mw.process_request(request, self.spider) is None
- def test_real_http_cache_middleware_response_cached_and_fresh(self):
+ def test_middleware_rfc2616policy_response_cached_and_fresh(self):
# test response cached and fresh
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
response = mw.process_response(self.request, self.response, self.spider)
self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
assert 'cached' not in response.flags
- def test_real_http_cache_middleware_response_cached_and_stale(self):
+ def test_middleware_rfc2616policy_response_cached_and_stale(self):
# test response cached but stale
- with self._middleware(HTTPCACHE_POLICY='rfc2616',
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy,
HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmRealCacheStorage') as mw:
response = Response('http://www.example.com', headers=
{'Content-Type': 'text/html', 'Cache-Control': 'no-cache'},
@@ -239,10 +242,10 @@ def test_real_http_cache_middleware_response_cached_and_stale(self):
response = mw.storage.retrieve_response(self.spider, self.request)
assert isinstance(response, Request)
- def test_real_http_cache_storage_response_cached_and_fresh(self):
+ def test_storage_rfc2616policy_response_cached_and_fresh(self):
# test response is cached and is fresh
# (response requested should be same as response received)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -252,10 +255,10 @@ def test_real_http_cache_storage_response_cached_and_fresh(self):
response2 = storage.retrieve_response(self.spider, self.request)
self.assertEqualResponse(response, response2)
- def test_real_http_cache_storage_response403_cached_and_further_requests_ignored(self):
+ def test_storage_rfc2616policy_response403_cached_and_further_requests_ignored(self):
# test response is cached but further requests are ignored
# because response status is 403 (as per the RFC)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -265,10 +268,10 @@ def test_real_http_cache_storage_response403_cached_and_further_requests_ignored
self.assertRaises(IgnoreRequest, storage.retrieve_response,
self.spider, self.request)
- def test_real_http_cache_storage_response_cached_and_stale(self):
+ def test_storage_rfc2616policy_response_cached_and_stale(self):
# test response is cached and is stale (no cache validators inserted)
# (request should be same as response received)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -279,9 +282,9 @@ def test_real_http_cache_storage_response_cached_and_stale(self):
assert isinstance(response2, Request)
self.assertEqualRequest(self.request, response2)
- def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators(self):
+ def test_storage_rfc2616policy_response_cached_and_stale_with_cache_validators(self):
# test response is cached and is stale and cache validators are inserted
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -292,10 +295,10 @@ def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators
assert isinstance(response2, Request)
self.assertEqualRequestButWithCacheValidators(self.request, response2)
- def test_real_http_cache_storage_response_cached_and_transparent(self):
+ def test_storage_rfc2616policy_response_cached_and_transparent(self):
# test response is not cached because of the request's Cache-Control 'no-cache' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
request = Request('http://www.example.com',
headers={'User-Agent': 'test', 'Cache-Control': 'no-cache'})
assert storage.retrieve_response(self.spider, request) is None
Please sign in to comment.
Something went wrong with that request. Please try again.