Skip to content
This repository
Browse code

Instead of extending from HttpCachePolicy, following the same approach used for storage selection
  • Loading branch information...
commit cf5f0203b782425aec470e603853885e2aec641c 1 parent 492831f
pedrofaustino authored
1  .gitignore
@@ -4,3 +4,4 @@ dropin.cache
4 4 docs/build
5 5 *egg-info
6 6 .tox
  7 +venv
59 scrapy/contrib/downloadermiddleware/httpcache.py
@@ -16,44 +16,51 @@
16 16 from scrapy.utils.project import data_path
17 17
18 18
19   -class HttpCachePolicy(object):
  19 +class DummyPolicy(object):
20 20 def __init__(self, settings):
21 21 self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
22 22 self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
23   - self.policy = settings.get('HTTPCACHE_POLICY')
  23 +
  24 + def should_cache_response(self, response):
  25 + return response.status not in self.ignore_http_codes
  26 +
  27 + def should_cache_request(self, request):
  28 + return urlparse_cached(request).scheme not in self.ignore_schemes
24 29
25   - if self.policy == 'dummy':
26   - self.use_dummy_cache = True
27   - else:
28   - self.use_dummy_cache = False
  30 +
  31 +class RFC2616Policy(DummyPolicy):
  32 + def __init__(self, settings):
  33 + super(RFC2616Policy, self).__init__(settings)
29 34
30 35 def should_cache_response(self, response):
31   - retval = response.status not in self.ignore_http_codes
32   - if not self.use_dummy_cache and response.headers.has_key('cache-control'):
  36 + retval = super(RFC2616Policy, self).should_cache_response(response)
  37 +
  38 + if response.headers.has_key('cache-control'):
33 39 retval = retval and (response.headers['cache-control'].lower().find('no-store') == -1)
34 40 #retval = retval and self.policy_response(response)
35 41 return retval
36 42
37 43 def should_cache_request(self, request):
38   - retval = urlparse_cached(request).scheme not in self.ignore_schemes
39   - if not self.use_dummy_cache and request.headers.has_key('cache-control'):
  44 + retval = super(RFC2616Policy, self).should_cache_request(request)
  45 +
  46 + if request.headers.has_key('cache-control'):
40 47 retval = retval and (request.headers['cache-control'].lower().find('no-store') == -1)
41 48 #retval = retval and self.policy_request(request)
42 49 return retval
43 50
44   -class HttpCacheMiddleware(HttpCachePolicy):
  51 +class HttpCacheMiddleware(object):
45 52
46 53 def __init__(self, settings, stats):
47 54 if not settings.getbool('HTTPCACHE_ENABLED'):
48 55 raise NotConfigured
49 56 self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
50 57 self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
  58 + self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
51 59 self.stats = stats
52   - super(HttpCacheMiddleware, self).__init__(settings)
53 60
54 61 @classmethod
55 62 def from_crawler(cls, crawler):
56   - o = cls.from_settings(crawler.settings, crawler.stats)
  63 + o = cls(crawler.settings, crawler.stats)
57 64 crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
58 65 crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
59 66 return o
@@ -65,7 +72,7 @@ def spider_closed(self, spider):
65 72 self.storage.close_spider(spider)
66 73
67 74 def process_request(self, request, spider):
68   - if not self.should_cache_request(request):
  75 + if not self.policy.should_cache_request(request):
69 76 return
70 77 response = self.storage.retrieve_response(spider, request)
71 78
@@ -75,14 +82,14 @@ def process_request(self, request, spider):
75 82 self.stats.inc_value('httpcache/revalidation', spider=spider)
76 83 return
77 84
78   - if response and self.should_cache_response(response):
  85 + if response and self.policy.should_cache_response(response):
79 86 self.stats.inc_value('httpcache/hit', spider=spider)
80   - if self.use_dummy_cache:
81   - response.flags.append('cached')
82   - return response
83   - else:
  87 + if isinstance(self.policy, RFC2616Policy):
84 88 # Response cached and fresh
85 89 raise IgnoreRequest("Ignored request already in cache: %s" % request)
  90 + else:
  91 + response.flags.append('cached')
  92 + return response
86 93
87 94 # Response not cached
88 95 self.stats.inc_value('httpcache/miss', spider=spider)
@@ -90,19 +97,19 @@ def process_request(self, request, spider):
90 97 raise IgnoreRequest("Ignored request not in cache: %s" % request)
91 98
92 99 def process_response(self, request, response, spider):
93   - if (self.should_cache_request(request)
94   - and self.should_cache_response(response)):
95   - if self.use_dummy_cache:
96   - if 'cached' not in response.flags:
97   - self.storage.store_response(spider, request, response)
98   - self.stats.inc_value('httpcache/store', spider=spider)
99   - else:
  100 + if (self.policy.should_cache_request(request)
  101 + and self.policy.should_cache_response(response)):
  102 + if isinstance(self.policy, RFC2616Policy):
100 103 if response.status != 304:
101 104 self.storage.store_response(spider, request, response)
102 105 self.stats.inc_value('httpcache/store', spider=spider)
103 106 else:
104 107 response.flags.append('cached')
105 108 self.stats.inc_value('httpcache/hit', spider=spider)
  109 + else:
  110 + if 'cached' not in response.flags:
  111 + self.storage.store_response(spider, request, response)
  112 + self.stats.inc_value('httpcache/store', spider=spider)
106 113 return response
107 114
108 115
2  scrapy/settings/default_settings.py
@@ -141,7 +141,7 @@
141 141 HTTPCACHE_IGNORE_HTTP_CODES = []
142 142 HTTPCACHE_IGNORE_SCHEMES = ['file']
143 143 HTTPCACHE_DBM_MODULE = 'anydbm'
144   -HTTPCACHE_POLICY = 'dummy'
  144 +HTTPCACHE_POLICY = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
145 145
146 146 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
147 147
47 scrapy/tests/test_downloadermiddleware_httpcache.py
@@ -20,6 +20,9 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
20 20 storage_class = FilesystemCacheStorage
21 21 realcache_storage_class = DbmRealCacheStorage
22 22
  23 + dummy_policy = 'scrapy.contrib.downloadermiddleware.httpcache.DummyPolicy'
  24 + rfc2616_policy = 'scrapy.contrib.downloadermiddleware.httpcache.RFC2616Policy'
  25 +
23 26 yesterday = email.utils.formatdate(time.time() - 1 * 24 * 60 * 60)
24 27 now = email.utils.formatdate()
25 28 tomorrow = email.utils.formatdate(time.time() + 1 * 24 * 60 * 60)
@@ -44,7 +47,7 @@ def _get_settings(self, **new_settings):
44 47 'HTTPCACHE_DIR': self.tmpdir,
45 48 'HTTPCACHE_EXPIRATION_SECS': 1,
46 49 'HTTPCACHE_IGNORE_HTTP_CODES': [],
47   - 'HTTPCACHE_POLICY': 'dummy'
  50 + 'HTTPCACHE_POLICY': self.dummy_policy
48 51 }
49 52 settings.update(new_settings)
50 53 return Settings(settings)
@@ -52,7 +55,7 @@ def _get_settings(self, **new_settings):
52 55 @contextmanager
53 56 def _storage(self, **new_settings):
54 57 settings = self._get_settings(**new_settings)
55   - if settings.get('HTTPCACHE_POLICY') == 'dummy':
  58 + if settings.get('HTTPCACHE_POLICY') == self.dummy_policy:
56 59 storage = self.storage_class(settings)
57 60 else:
58 61 storage = self.realcache_storage_class(settings)
@@ -182,10 +185,10 @@ def test_middleware_ignore_http_codes(self):
182 185 self.assertEqualResponse(self.response, response)
183 186 assert 'cached' in response.flags
184 187
185   - def test_real_http_cache_middleware_response304_not_cached(self):
  188 + def test_middleware_rfc2616policy_response304_not_cached(self):
186 189 # test response is not cached because the status is 304 Not Modified
187 190 # (so it should be cached already)
188   - with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
  191 + with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
189 192 assert mw.process_request(self.request, self.spider) is None
190 193 response = Response('http://www.example.com', status=304)
191 194 mw.process_response(self.request, response, self.spider)
@@ -194,10 +197,10 @@ def test_real_http_cache_middleware_response304_not_cached(self):
194 197 assert mw.storage.retrieve_response(self.spider, self.request) is None
195 198 assert mw.process_request(self.request, self.spider) is None
196 199
197   - def test_real_http_cache_middleware_response_nostore_not_cached(self):
  200 + def test_middleware_rfc2616policy_response_nostore_not_cached(self):
198 201 # test response is not cached because of the Cache-Control 'no-store' directive
199 202 # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
200   - with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
  203 + with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
201 204 assert mw.process_request(self.request, self.spider) is None
202 205 response = Response('http://www.example.com', headers=
203 206 {'Content-Type': 'text/html', 'Cache-Control': 'no-store'},
@@ -207,10 +210,10 @@ def test_real_http_cache_middleware_response_nostore_not_cached(self):
207 210 assert mw.storage.retrieve_response(self.spider, self.request) is None
208 211 assert mw.process_request(self.request, self.spider) is None
209 212
210   - def test_real_http_cache_middleware_request_nostore_not_cached(self):
  213 + def test_middleware_rfc2616policy_request_nostore_not_cached(self):
211 214 # test response is not cached because of the request's Cache-Control 'no-store' directive
212 215 # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
213   - with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
  216 + with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
214 217 request = Request('http://www.example.com',
215 218 headers={'User-Agent': 'test', 'Cache-Control': 'no-store'})
216 219 assert mw.process_request(request, self.spider) is None
@@ -219,16 +222,16 @@ def test_real_http_cache_middleware_request_nostore_not_cached(self):
219 222 assert mw.storage.retrieve_response(self.spider, request) is None
220 223 assert mw.process_request(request, self.spider) is None
221 224
222   - def test_real_http_cache_middleware_response_cached_and_fresh(self):
  225 + def test_middleware_rfc2616policy_response_cached_and_fresh(self):
223 226 # test response cached and fresh
224   - with self._middleware(HTTPCACHE_POLICY='rfc2616') as mw:
  227 + with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy) as mw:
225 228 response = mw.process_response(self.request, self.response, self.spider)
226 229 self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
227 230 assert 'cached' not in response.flags
228 231
229   - def test_real_http_cache_middleware_response_cached_and_stale(self):
  232 + def test_middleware_rfc2616policy_response_cached_and_stale(self):
230 233 # test response cached but stale
231   - with self._middleware(HTTPCACHE_POLICY='rfc2616',
  234 + with self._middleware(HTTPCACHE_POLICY=self.rfc2616_policy,
232 235 HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmRealCacheStorage') as mw:
233 236 response = Response('http://www.example.com', headers=
234 237 {'Content-Type': 'text/html', 'Cache-Control': 'no-cache'},
@@ -239,10 +242,10 @@ def test_real_http_cache_middleware_response_cached_and_stale(self):
239 242 response = mw.storage.retrieve_response(self.spider, self.request)
240 243 assert isinstance(response, Request)
241 244
242   - def test_real_http_cache_storage_response_cached_and_fresh(self):
  245 + def test_storage_rfc2616policy_response_cached_and_fresh(self):
243 246 # test response is cached and is fresh
244 247 # (response requested should be same as response received)
245   - with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
  248 + with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
246 249 assert storage.retrieve_response(self.spider, self.request) is None
247 250
248 251 response = Response('http://www.example.com', headers=
@@ -252,10 +255,10 @@ def test_real_http_cache_storage_response_cached_and_fresh(self):
252 255 response2 = storage.retrieve_response(self.spider, self.request)
253 256 self.assertEqualResponse(response, response2)
254 257
255   - def test_real_http_cache_storage_response403_cached_and_further_requests_ignored(self):
  258 + def test_storage_rfc2616policy_response403_cached_and_further_requests_ignored(self):
256 259 # test response is cached but further requests are ignored
257 260 # because response status is 403 (as per the RFC)
258   - with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
  261 + with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
259 262 assert storage.retrieve_response(self.spider, self.request) is None
260 263
261 264 response = Response('http://www.example.com', headers=
@@ -265,10 +268,10 @@ def test_real_http_cache_storage_response403_cached_and_further_requests_ignored
265 268 self.assertRaises(IgnoreRequest, storage.retrieve_response,
266 269 self.spider, self.request)
267 270
268   - def test_real_http_cache_storage_response_cached_and_stale(self):
  271 + def test_storage_rfc2616policy_response_cached_and_stale(self):
269 272 # test response is cached and is stale (no cache validators inserted)
270 273 # (request should be same as response received)
271   - with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
  274 + with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
272 275 assert storage.retrieve_response(self.spider, self.request) is None
273 276
274 277 response = Response('http://www.example.com', headers=
@@ -279,9 +282,9 @@ def test_real_http_cache_storage_response_cached_and_stale(self):
279 282 assert isinstance(response2, Request)
280 283 self.assertEqualRequest(self.request, response2)
281 284
282   - def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators(self):
  285 + def test_storage_rfc2616policy_response_cached_and_stale_with_cache_validators(self):
283 286 # test response is cached and is stale and cache validators are inserted
284   - with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
  287 + with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
285 288 assert storage.retrieve_response(self.spider, self.request) is None
286 289
287 290 response = Response('http://www.example.com', headers=
@@ -292,10 +295,10 @@ def test_real_http_cache_storage_response_cached_and_stale_with_cache_validators
292 295 assert isinstance(response2, Request)
293 296 self.assertEqualRequestButWithCacheValidators(self.request, response2)
294 297
295   - def test_real_http_cache_storage_response_cached_and_transparent(self):
  298 + def test_storage_rfc2616policy_response_cached_and_transparent(self):
296 299 # test response is not cached because of the request's Cache-Control 'no-cache' directive
297 300 # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.2
298   - with self._storage(HTTPCACHE_POLICY='rfc2616') as storage:
  301 + with self._storage(HTTPCACHE_POLICY=self.rfc2616_policy) as storage:
299 302 request = Request('http://www.example.com',
300 303 headers={'User-Agent': 'test', 'Cache-Control': 'no-cache'})
301 304 assert storage.retrieve_response(self.spider, request) is None

0 comments on commit cf5f020

Please sign in to comment.
Something went wrong with that request. Please try again.