Permalink
Browse files

Instead of extending from HttpCachePolicy, following the same approach used for storage selection
  • Loading branch information...
1 parent 492831f commit cf5f0203b782425aec470e603853885e2aec641c Pedro Faustino committed Dec 28, 2012
View
@@ -4,3 +4,4 @@ dropin.cache
docs/build
*egg-info
.tox
+venv
@@ -16,44 +16,51 @@
from scrapy.utils.project import data_path
-class HttpCachePolicy(object):
+class DummyPolicy(object):
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
- self.policy = settings.get('HTTPCACHE_POLICY')
+
+ def should_cache_response(self, response):
+ return response.status not in self.ignore_http_codes
+
+ def should_cache_request(self, request):
+ return urlparse_cached(request).scheme not in self.ignore_schemes
- if self.policy == 'dummy':
- self.use_dummy_cache = True
- else:
- self.use_dummy_cache = False
+
+class RFC2616Policy(DummyPolicy):
+ def __init__(self, settings):
+ super(RFC2616Policy, self).__init__(settings)
def should_cache_response(self, response):
- retval = response.status not in self.ignore_http_codes
- if not self.use_dummy_cache and response.headers.has_key('cache-control'):
+ retval = super(RFC2616Policy, self).should_cache_response(response)
+
+ if response.headers.has_key('cache-control'):
retval = retval and (response.headers['cache-control'].lower().find('no-store') == -1)
#retval = retval and self.policy_response(response)
return retval
def should_cache_request(self, request):
- retval = urlparse_cached(request).scheme not in self.ignore_schemes
- if not self.use_dummy_cache and request.headers.has_key('cache-control'):
+ retval = super(RFC2616Policy, self).should_cache_request(request)
+
+ if request.headers.has_key('cache-control'):
retval = retval and (request.headers['cache-control'].lower().find('no-store') == -1)
#retval = retval and self.policy_request(request)
return retval
-class HttpCacheMiddleware(HttpCachePolicy):
+class HttpCacheMiddleware(object):
def __init__(self, settings, stats):
if not settings.getbool('HTTPCACHE_ENABLED'):
raise NotConfigured
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
+ self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
self.stats = stats
- super(HttpCacheMiddleware, self).__init__(settings)
@classmethod
def from_crawler(cls, crawler):
- o = cls.from_settings(crawler.settings, crawler.stats)
+ o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
@@ -65,7 +72,7 @@ def spider_closed(self, spider):
self.storage.close_spider(spider)
def process_request(self, request, spider):
- if not self.should_cache_request(request):
+ if not self.policy.should_cache_request(request):
return
response = self.storage.retrieve_response(spider, request)
@@ -75,34 +82,34 @@ def process_request(self, request, spider):
self.stats.inc_value('httpcache/revalidation', spider=spider)
return
- if response and self.should_cache_response(response):
+ if response and self.policy.should_cache_response(response):
self.stats.inc_value('httpcache/hit', spider=spider)
- if self.use_dummy_cache:
- response.flags.append('cached')
- return response
- else:
+ if isinstance(self.policy, RFC2616Policy):
# Response cached and fresh
raise IgnoreRequest("Ignored request already in cache: %s" % request)
+ else:
+ response.flags.append('cached')
+ return response
# Response not cached
self.stats.inc_value('httpcache/miss', spider=spider)
if self.ignore_missing:
raise IgnoreRequest("Ignored request not in cache: %s" % request)
def process_response(self, request, response, spider):
- if (self.should_cache_request(request)
- and self.should_cache_response(response)):
- if self.use_dummy_cache:
- if 'cached' not in response.flags:
- self.storage.store_response(spider, request, response)
- self.stats.inc_value('httpcache/store', spider=spider)
- else:
+ if (self.policy.should_cache_request(request)
+ and self.policy.should_cache_response(response)):
+ if isinstance(self.policy, RFC2616Policy):
if response.status != 304:
self.storage.store_response(spider, request, response)
self.stats.inc_value('httpcache/store', spider=spider)
else:
response.flags.append('cached')
self.stats.inc_value('httpcache/hit', spider=spider)
+ else:
+ if 'cached' not in response.flags:
+ self.storage.store_response(spider, request, response)
+ self.stats.inc_value('httpcache/store', spider=spider)
return response
@@ -141,7 +141,7 @@
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
-HTTPCACHE_POLICY = 'dummy'
+HTTPCACHE_POLICY = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
@@ -20,6 +20,9 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
storage_class = FilesystemCacheStorage
realcache_storage_class = DbmRealCacheStorage
+ dummy_policy = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
+ rfc2616_policy = 'scrapy.contrib.downloadermiddleware.httpcache.RFC2616Policy'
+
yesterday = email.utils.formatdate(time.time() - 1 * 24 * 60 * 60)
now = email.utils.formatdate()
tomorrow = email.utils.formatdate(time.time() + 1 * 24 * 60 * 60)
@@ -44,15 +47,15 @@ def _get_settings(self, **new_settings):
'HTTPCACHE_DIR': self.tmpdir,
'HTTPCACHE_EXPIRATION_SECS': 1,
'HTTPCACHE_IGNORE_HTTP_CODES': [],
- 'HTTPCACHE_POLICY': 'dummy'
+ 'HTTPCACHE_POLICY': self.dummy_policy
}
settings.update(new_settings)
return Settings(settings)
@contextmanager
def _storage(self, **new_settings):
settings = self._get_settings(**new_settings)
- if settings.get('HTTPCACHE_POLICY') == 'dummy':
+ if settings.get('HTTPCACHE_POLICY') == self.dummy_policy:
storage = self.storage_class(settings)
else:
storage = self.realcache_storage_class(settings)
@@ -182,10 +185,10 @@ def test_middleware_ignore_http_codes(self):
self.assertEqualResponse(self.response, response)
assert 'cached' in response.flags
- def test_real_http_cache_middleware_response304_not_cached(self):
+ def test_middleware_rfc2616policy_response304_not_cached(self):
# test response is not cached because the status is 304 Not Modified
# (so it should be cached already)
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
assert mw.process_request(self.request, self.spider) is None
response = Response('http://www.example.com', status=304)
mw.process_response(self.request, response, self.spider)
@@ -194,10 +197,10 @@ def test_real_http_cache_middleware_response304_not_cached(self):
assert mw.storage.retrieve_response(self.spider, self.request) is None
assert mw.process_request(self.request, self.spider) is None
- def test_real_http_cache_middleware_response_nostore_not_cached(self):
+ def test_middleware_rfc2616policy_response_nostore_not_cached(self):
# test response is not cached because of the Cache-Control 'no-store' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
assert mw.process_request(self.request, self.spider) is None
response = Response('http://www.example.com', headers=
{'Content-Type': 'text/html', 'Cache-Control': 'no-store'},
@@ -207,10 +210,10 @@ def test_real_http_cache_middleware_response_nostore_not_cached(self):
assert mw.storage.retrieve_response(self.spider, self.request) is None
assert mw.process_request(self.request, self.spider) is None
- def test_real_http_cache_middleware_request_nostore_not_cached(self):
+ def test_middleware_rfc2616policy_request_nostore_not_cached(self):
# test response is not cached because of the request's Cache-Control 'no-store' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
request = Request('http://www.example.com',
headers={'User-Agent': 'test', 'Cache-Control': 'no-store'})
assert mw.process_request(request, self.spider) is None
@@ -219,16 +222,16 @@ def test_real_http_cache_middleware_request_nostore_not_cached(self):
assert mw.storage.retrieve_response(self.spider, request) is None
assert mw.process_request(request, self.spider) is None
- def test_real_http_cache_middleware_response_cached_and_fresh(self):
+ def test_middleware_rfc2616policy_response_cached_and_fresh(self):
# test response cached and fresh
- with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
response = mw.process_response(self.request, self.response, self.spider)
self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
assert 'cached' not in response.flags
- def test_real_http_cache_middleware_response_cached_and_stale(self):
+ def test_middleware_rfc2616policy_response_cached_and_stale(self):
# test response cached but stale
- with self._middleware(HTTPCACHE_POLICY='rfc2616',
+ with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy,
HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmRealCacheStorage') as mw:
response = Response('http://www.example.com', headers=
{'Content-Type': 'text/html', 'Cache-Control': 'no-cache'},
@@ -239,10 +242,10 @@ def test_real_http_cache_middleware_response_cached_and_stale(self):
response = mw.storage.retrieve_response(self.spider, self.request)
assert isinstance(response, Request)
- def test_real_http_cache_storage_response_cached_and_fresh(self):
+ def test_storage_rfc2616policy_response_cached_and_fresh(self):
# test response is cached and is fresh
# (response requested should be same as response received)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -252,10 +255,10 @@ def test_real_http_cache_storage_response_cached_and_fresh(self):
response2 = storage.retrieve_response(self.spider, self.request)
self.assertEqualResponse(response, response2)
- def test_real_http_cache_storage_response403_cached_and_further_requests_ignored(self):
+ def test_storage_rfc2616policy_response403_cached_and_further_requests_ignored(self):
# test response is cached but further requests are ignored
# because response status is 403 (as per the RFC)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -265,10 +268,10 @@ def test_real_http_cache_storage_response403_cached_and_further_requests_ignored
self.assertRaises(IgnoreRequest, storage.retrieve_response,
self.spider, self.request)
- def test_real_http_cache_storage_response_cached_and_stale(self):
+ def test_storage_rfc2616policy_response_cached_and_stale(self):
# test response is cached and is stale (no cache validators inserted)
# (request should be same as response received)
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -279,9 +282,9 @@ def test_real_http_cache_storage_response_cached_and_stale(self):
assert isinstance(response2, Request)
self.assertEqualRequest(self.request, response2)
- def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators(self):
+ def test_storage_rfc2616policy_response_cached_and_stale_with_cache_validators(self):
# test response is cached and is stale and cache validators are inserted
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
assert storage.retrieve_response(self.spider, self.request) is None
response = Response('http://www.example.com', headers=
@@ -292,10 +295,10 @@ def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators
assert isinstance(response2, Request)
self.assertEqualRequestButWithCacheValidators(self.request, response2)
- def test_real_http_cache_storage_response_cached_and_transparent(self):
+ def test_storage_rfc2616policy_response_cached_and_transparent(self):
# test response is not cached because of the request's Cache-Control 'no-cache' directive
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
- with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
+ with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
request = Request('http://www.example.com',
headers={'User-Agent': 'test', 'Cache-Control': 'no-cache'})
assert storage.retrieve_response(self.spider, request) is None

0 comments on commit cf5f020

Please sign in to comment.