Permalink
Browse files

improve Mac OS compatibility

Highlights:
* FifoDiskQueue: mixing the buffered seek (file.seek) with unbuffered reads
  causes problems, so the tail file is now positioned with os.lseek
* BSD's find does not default to the current directory, so pass `.` explicitly
* a gdbm database must be closed before the same file can be reopened
* skip PIL tests if JPEG support is not available
  • Loading branch information...
1 parent b9a9614 commit fc405e98aa7cac7bb21a6234d80164573c0e420d @alexcepoi alexcepoi committed Dec 1, 2012
View
@@ -35,7 +35,7 @@ vsftpd_log_file=/dev/null
vsftpd_pid=$!
fi
-find -name '*.py[co]' -delete
+find . -name '*.py[co]' -delete
if [ $# -eq 0 ]; then
$trial --reporter=text scrapy scrapyd
else
@@ -1,11 +1,16 @@
-import unittest, tempfile, shutil, time
+import time
+import tempfile
+import shutil
+import unittest
+from contextlib import contextmanager
from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
-from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.settings import Settings
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.test import get_crawler
+from scrapy.contrib.downloadermiddleware.httpcache import \
+ FilesystemCacheStorage, HttpCacheMiddleware
class HttpCacheMiddlewareTest(unittest.TestCase):
@@ -16,8 +21,10 @@ def setUp(self):
self.crawler = get_crawler()
self.spider = BaseSpider('example.com')
self.tmpdir = tempfile.mkdtemp()
- self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
- self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)
+ self.request = Request('http://www.example.com',
+ headers={'User-Agent': 'test'})
+ self.response = Response('http://www.example.com', headers=
+ {'Content-Type': 'text/html'}, body='test body', status=202)
self.crawler.stats.open_spider(self.spider)
def tearDown(self):
@@ -34,113 +41,135 @@ def _get_settings(self, **new_settings):
settings.update(new_settings)
return Settings(settings)
- def _get_storage(self, **new_settings):
- return self.storage_class(self._get_settings(**new_settings))
-
- def _get_middleware(self, **new_settings):
- mw = HttpCacheMiddleware(self._get_settings(**new_settings), self.crawler.stats)
+ @contextmanager
+ def _storage(self, **new_settings):
+ settings = self._get_settings(**new_settings)
+ storage = self.storage_class(settings)
+ storage.open_spider(self.spider)
+ try:
+ yield storage
+ finally:
+ storage.close_spider(self.spider)
+
+ @contextmanager
+ def _middleware(self, **new_settings):
+ settings = self._get_settings(**new_settings)
+ mw = HttpCacheMiddleware(settings, self.crawler.stats)
mw.spider_opened(self.spider)
- return mw
+ try:
+ yield mw
+ finally:
+ mw.spider_closed(self.spider)
def test_storage(self):
- storage = self._get_storage()
- request2 = self.request.copy()
- assert storage.retrieve_response(self.spider, request2) is None
- storage.store_response(self.spider, self.request, self.response)
- response2 = storage.retrieve_response(self.spider, request2)
- assert isinstance(response2, HtmlResponse) # inferred from content-type header
- self.assertEqualResponse(self.response, response2)
- time.sleep(2) # wait for cache to expire
- assert storage.retrieve_response(self.spider, request2) is None
+ with self._storage() as storage:
+ request2 = self.request.copy()
+ assert storage.retrieve_response(self.spider, request2) is None
+
+ storage.store_response(self.spider, self.request, self.response)
+ response2 = storage.retrieve_response(self.spider, request2)
+ assert isinstance(response2, HtmlResponse) # content-type header
+ self.assertEqualResponse(self.response, response2)
+
+ time.sleep(2) # wait for cache to expire
+ assert storage.retrieve_response(self.spider, request2) is None
def test_storage_never_expire(self):
- storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
- assert storage.retrieve_response(self.spider, self.request) is None
- storage.store_response(self.spider, self.request, self.response)
- time.sleep(0.5) # give the chance to expire
- assert storage.retrieve_response(self.spider, self.request)
+ with self._storage(HTTPCACHE_EXPIRATION_SECS=0) as storage:
+ assert storage.retrieve_response(self.spider, self.request) is None
+ storage.store_response(self.spider, self.request, self.response)
+ time.sleep(0.5) # give the chance to expire
+ assert storage.retrieve_response(self.spider, self.request)
def test_middleware(self):
- mw = self._get_middleware()
- assert mw.process_request(self.request, self.spider) is None
- mw.process_response(self.request, self.response, self.spider)
- response = mw.process_request(self.request, self.spider)
- assert isinstance(response, HtmlResponse)
- self.assertEqualResponse(self.response, response)
- assert 'cached' in response.flags
+ with self._middleware() as mw:
+ assert mw.process_request(self.request, self.spider) is None
+ mw.process_response(self.request, self.response, self.spider)
+
+ response = mw.process_request(self.request, self.spider)
+ assert isinstance(response, HtmlResponse)
+ self.assertEqualResponse(self.response, response)
+ assert 'cached' in response.flags
def test_different_request_response_urls(self):
- mw = self._get_middleware()
- req = Request('http://host.com/path')
- res = Response('http://host2.net/test.html')
- assert mw.process_request(req, self.spider) is None
- mw.process_response(req, res, self.spider)
- cached = mw.process_request(req, self.spider)
- assert isinstance(cached, Response)
- self.assertEqualResponse(res, cached)
- assert 'cached' in cached.flags
+ with self._middleware() as mw:
+ req = Request('http://host.com/path')
+ res = Response('http://host2.net/test.html')
+
+ assert mw.process_request(req, self.spider) is None
+ mw.process_response(req, res, self.spider)
+
+ cached = mw.process_request(req, self.spider)
+ assert isinstance(cached, Response)
+ self.assertEqualResponse(res, cached)
+ assert 'cached' in cached.flags
def test_middleware_ignore_missing(self):
- mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
- self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
- mw.process_response(self.request, self.response, self.spider)
- response = mw.process_request(self.request, self.spider)
- assert isinstance(response, HtmlResponse)
- self.assertEqualResponse(self.response, response)
- assert 'cached' in response.flags
+ with self._middleware(HTTPCACHE_IGNORE_MISSING=True) as mw:
+ self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
+ mw.process_response(self.request, self.response, self.spider)
+ response = mw.process_request(self.request, self.spider)
+ assert isinstance(response, HtmlResponse)
+ self.assertEqualResponse(self.response, response)
+ assert 'cached' in response.flags
def test_middleware_ignore_schemes(self):
# http responses are cached by default
req, res = Request('http://test.com/'), Response('http://test.com/')
- mw = self._get_middleware()
- assert mw.process_request(req, self.spider) is None
- mw.process_response(req, res, self.spider)
- cached = mw.process_request(req, self.spider)
- assert isinstance(cached, Response), type(cached)
- self.assertEqualResponse(res, cached)
- assert 'cached' in cached.flags
+ with self._middleware() as mw:
+ assert mw.process_request(req, self.spider) is None
+ mw.process_response(req, res, self.spider)
+
+ cached = mw.process_request(req, self.spider)
+ assert isinstance(cached, Response), type(cached)
+ self.assertEqualResponse(res, cached)
+ assert 'cached' in cached.flags
# file response is not cached by default
req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
- mw = self._get_middleware()
- assert mw.process_request(req, self.spider) is None
- mw.process_response(req, res, self.spider)
- assert mw.storage.retrieve_response(self.spider, req) is None
- assert mw.process_request(req, self.spider) is None
+ with self._middleware() as mw:
+ assert mw.process_request(req, self.spider) is None
+ mw.process_response(req, res, self.spider)
+
+ assert mw.storage.retrieve_response(self.spider, req) is None
+ assert mw.process_request(req, self.spider) is None
# s3 scheme response is cached by default
req, res = Request('s3://bucket/key'), Response('http://bucket/key')
- mw = self._get_middleware()
- assert mw.process_request(req, self.spider) is None
- mw.process_response(req, res, self.spider)
- cached = mw.process_request(req, self.spider)
- assert isinstance(cached, Response), type(cached)
- self.assertEqualResponse(res, cached)
- assert 'cached' in cached.flags
+ with self._middleware() as mw:
+ assert mw.process_request(req, self.spider) is None
+ mw.process_response(req, res, self.spider)
+
+ cached = mw.process_request(req, self.spider)
+ assert isinstance(cached, Response), type(cached)
+ self.assertEqualResponse(res, cached)
+ assert 'cached' in cached.flags
# ignore s3 scheme
req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
- mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
- assert mw.process_request(req, self.spider) is None
- mw.process_response(req, res, self.spider)
- assert mw.storage.retrieve_response(self.spider, req) is None
- assert mw.process_request(req, self.spider) is None
+ with self._middleware(HTTPCACHE_IGNORE_SCHEMES=['s3']) as mw:
+ assert mw.process_request(req, self.spider) is None
+ mw.process_response(req, res, self.spider)
+
+ assert mw.storage.retrieve_response(self.spider, req) is None
+ assert mw.process_request(req, self.spider) is None
def test_middleware_ignore_http_codes(self):
# test response is not cached
- mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
- assert mw.process_request(self.request, self.spider) is None
- mw.process_response(self.request, self.response, self.spider)
- assert mw.storage.retrieve_response(self.spider, self.request) is None
- assert mw.process_request(self.request, self.spider) is None
+ with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202]) as mw:
+ assert mw.process_request(self.request, self.spider) is None
+ mw.process_response(self.request, self.response, self.spider)
+
+ assert mw.storage.retrieve_response(self.spider, self.request) is None
+ assert mw.process_request(self.request, self.spider) is None
# test response is cached
- mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
- mw.process_response(self.request, self.response, self.spider)
- response = mw.process_request(self.request, self.spider)
- assert isinstance(response, HtmlResponse)
- self.assertEqualResponse(self.response, response)
- assert 'cached' in response.flags
+ with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203]) as mw:
+ mw.process_response(self.request, self.response, self.spider)
+ response = mw.process_request(self.request, self.spider)
+ assert isinstance(response, HtmlResponse)
+ self.assertEqualResponse(self.response, response)
+ assert 'cached' in response.flags
def assertEqualResponse(self, response1, response2):
self.assertEqual(response1.url, response2.url)
@@ -150,4 +179,3 @@ def assertEqualResponse(self, response1, response2):
if __name__ == '__main__':
unittest.main()
-
@@ -8,9 +8,11 @@
try:
from PIL import Image
- skip = False
except ImportError, e:
skip = True
+else:
+ encoders = set(('jpeg_encoder', 'jpeg_decoder'))
+ skip = not encoders.issubset(set(Image.core.__dict__))
def _mocked_download_func(request, info):
response = request.meta.get('response')
View
@@ -45,7 +45,7 @@ def __init__(self, path, chunksize=100000):
self.chunksize = self.info['chunksize']
self.headf = self._openchunk(self.info['head'][0], 'ab+')
self.tailf = self._openchunk(self.info['tail'][0])
- self.tailf.seek(self.info['tail'][2])
+ os.lseek(self.tailf.fileno(), self.info['tail'][2], os.SEEK_SET)
def push(self, string):
hnum, hpos = self.info['head']

0 comments on commit fc405e9

Please sign in to comment.