Skip to content

Commit

Permalink
Improve macOS compatibility
Browse files Browse the repository at this point in the history
Highlights:
* FifoDiskQueue: mixing buffered version of seek with unbuffered version
  of read causes problems
* BSD's find does not default to current directory
* gdbm needs to be closed before it can reopen the same file
* skip PIL tests if jpeg support is not available
  • Loading branch information
alexcepoi committed Dec 1, 2012
1 parent b9a9614 commit fc405e9
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 86 deletions.
2 changes: 1 addition & 1 deletion bin/runtests.sh
Expand Up @@ -35,7 +35,7 @@ vsftpd_log_file=/dev/null
vsftpd_pid=$!
fi

find -name '*.py[co]' -delete
find . -name '*.py[co]' -delete
if [ $# -eq 0 ]; then
$trial --reporter=text scrapy scrapyd
else
Expand Down
194 changes: 111 additions & 83 deletions scrapy/tests/test_downloadermiddleware_httpcache.py
@@ -1,11 +1,16 @@
import unittest, tempfile, shutil, time
import time
import tempfile
import shutil
import unittest
from contextlib import contextmanager

from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.settings import Settings
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.test import get_crawler
from scrapy.contrib.downloadermiddleware.httpcache import \
FilesystemCacheStorage, HttpCacheMiddleware


class HttpCacheMiddlewareTest(unittest.TestCase):
Expand All @@ -16,8 +21,10 @@ def setUp(self):
self.crawler = get_crawler()
self.spider = BaseSpider('example.com')
self.tmpdir = tempfile.mkdtemp()
self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)
self.request = Request('http://www.example.com',
headers={'User-Agent': 'test'})
self.response = Response('http://www.example.com', headers=
{'Content-Type': 'text/html'}, body='test body', status=202)
self.crawler.stats.open_spider(self.spider)

def tearDown(self):
Expand All @@ -34,113 +41,135 @@ def _get_settings(self, **new_settings):
settings.update(new_settings)
return Settings(settings)

def _get_storage(self, **new_settings):
return self.storage_class(self._get_settings(**new_settings))

def _get_middleware(self, **new_settings):
mw = HttpCacheMiddleware(self._get_settings(**new_settings), self.crawler.stats)
@contextmanager
def _storage(self, **new_settings):
settings = self._get_settings(**new_settings)
storage = self.storage_class(settings)
storage.open_spider(self.spider)
try:
yield storage
finally:
storage.close_spider(self.spider)

@contextmanager
def _middleware(self, **new_settings):
    """Yield an opened HttpCacheMiddleware; always close it on exit."""
    settings = self._get_settings(**new_settings)
    mw = HttpCacheMiddleware(settings, self.crawler.stats)
    mw.spider_opened(self.spider)
    try:
        yield mw
    finally:
        mw.spider_closed(self.spider)

def test_storage(self):
    """Responses round-trip through storage and expire after the TTL."""
    with self._storage() as storage:
        request2 = self.request.copy()
        assert storage.retrieve_response(self.spider, request2) is None

        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        # response class is inferred from the Content-Type header
        assert isinstance(response2, HtmlResponse)
        self.assertEqualResponse(self.response, response2)

        time.sleep(2)  # wait for the cache entry to expire
        assert storage.retrieve_response(self.spider, request2) is None

def test_storage_never_expire(self):
    """With HTTPCACHE_EXPIRATION_SECS=0, cached responses never expire."""
    with self._storage(HTTPCACHE_EXPIRATION_SECS=0) as storage:
        assert storage.retrieve_response(self.spider, self.request) is None
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5)  # give the entry a chance to (wrongly) expire
        assert storage.retrieve_response(self.spider, self.request)

def test_middleware(self):
    """A response seen by the middleware is later served from cache."""
    with self._middleware() as mw:
        # first request: cache miss
        assert mw.process_request(self.request, self.spider) is None
        mw.process_response(self.request, self.response, self.spider)

        # second request: served from cache, class inferred from headers
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

def test_different_request_response_urls(self):
    """Caching works even when the response URL differs from the request URL."""
    with self._middleware() as mw:
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html')

        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)

        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

def test_middleware_ignore_missing(self):
    """With HTTPCACHE_IGNORE_MISSING, uncached requests raise IgnoreRequest."""
    with self._middleware(HTTPCACHE_IGNORE_MISSING=True) as mw:
        self.assertRaises(IgnoreRequest,
                          mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

def test_middleware_ignore_schemes(self):
    """Only non-ignored URL schemes are cached (file is ignored by default)."""
    # http responses are cached by default
    req, res = Request('http://test.com/'), Response('http://test.com/')
    with self._middleware() as mw:
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)

        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response), type(cached)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

    # file response is not cached by default
    req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
    with self._middleware() as mw:
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)

        assert mw.storage.retrieve_response(self.spider, req) is None
        assert mw.process_request(req, self.spider) is None

    # s3 scheme response is cached by default
    req, res = Request('s3://bucket/key'), Response('http://bucket/key')
    with self._middleware() as mw:
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)

        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response), type(cached)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

    # ignore s3 scheme
    req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
    with self._middleware(HTTPCACHE_IGNORE_SCHEMES=['s3']) as mw:
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)

        assert mw.storage.retrieve_response(self.spider, req) is None
        assert mw.process_request(req, self.spider) is None

def test_middleware_ignore_http_codes(self):
    """Responses whose status is in HTTPCACHE_IGNORE_HTTP_CODES are not cached."""
    # test response is not cached (self.response has status 202)
    with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202]) as mw:
        assert mw.process_request(self.request, self.spider) is None
        mw.process_response(self.request, self.response, self.spider)

        assert mw.storage.retrieve_response(self.spider, self.request) is None
        assert mw.process_request(self.request, self.spider) is None

    # test response is cached (202 not in the ignore list)
    with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203]) as mw:
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

def assertEqualResponse(self, response1, response2):
self.assertEqual(response1.url, response2.url)
Expand All @@ -150,4 +179,3 @@ def assertEqualResponse(self, response1, response2):

# Run the test suite when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()

4 changes: 3 additions & 1 deletion scrapy/tests/test_pipeline_images.py
Expand Up @@ -8,9 +8,11 @@

try:
    from PIL import Image
except ImportError:
    # PIL is not installed at all: skip the whole image-pipeline suite.
    skip = True
else:
    # PIL may be built without JPEG support (common on macOS); these
    # tests require both the JPEG encoder and decoder.
    encoders = set(('jpeg_encoder', 'jpeg_decoder'))
    skip = not encoders.issubset(set(Image.core.__dict__))

def _mocked_download_func(request, info):
response = request.meta.get('response')
Expand Down
2 changes: 1 addition & 1 deletion scrapy/utils/queue.py
Expand Up @@ -45,7 +45,7 @@ def __init__(self, path, chunksize=100000):
self.chunksize = self.info['chunksize']
self.headf = self._openchunk(self.info['head'][0], 'ab+')
self.tailf = self._openchunk(self.info['tail'][0])
self.tailf.seek(self.info['tail'][2])
os.lseek(self.tailf.fileno(), self.info['tail'][2], os.SEEK_SET)

def push(self, string):
hnum, hpos = self.info['head']
Expand Down

0 comments on commit fc405e9

Please sign in to comment.