Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make lazy loading Download Handlers optional #3394

Merged
merged 2 commits into from Dec 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
10 changes: 8 additions & 2 deletions scrapy/core/downloader/handlers/__init__.py
Expand Up @@ -24,6 +24,7 @@ def __init__(self, crawler):
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
for scheme, clspath in six.iteritems(handlers):
self._schemes[scheme] = clspath
self._load_handler(scheme, skip_lazy=True)

crawler.signals.connect(self._close, signals.engine_stopped)

Expand All @@ -39,22 +40,27 @@ def _get_handler(self, scheme):
self._notconfigured[scheme] = 'no handler available for that scheme'
return None

return self._load_handler(scheme)

def _load_handler(self, scheme, skip_lazy=False):
    """Instantiate, cache and return the download handler for *scheme*.

    Returns None (recording the reason in ``self._notconfigured`` where
    applicable) when the handler raises NotConfigured, fails to load, or
    when *skip_lazy* is True and the handler class is lazy.
    """
    path = self._schemes[scheme]
    try:
        dhcls = load_object(path)
        # Handlers are lazy by default for backwards compatibility:
        # during eager loading at startup, skip any class that does not
        # explicitly opt out with ``lazy = False``.
        if skip_lazy and getattr(dhcls, 'lazy', True):
            return None
        dh = dhcls(self._crawler.settings)
    except NotConfigured as ex:
        self._notconfigured[scheme] = str(ex)
        return None
    except Exception as ex:
        # Lazy %-style logging args; exc_info=True attaches the traceback.
        logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                     {"clspath": path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._crawler})
        self._notconfigured[scheme] = str(ex)
        return None
    else:
        self._handlers[scheme] = dh
        return dh

def download_request(self, request, spider):
scheme = urlparse_cached(request).scheme
Expand Down
2 changes: 2 additions & 0 deletions scrapy/core/downloader/handlers/datauri.py
Expand Up @@ -6,6 +6,8 @@


class DataURIDownloadHandler(object):
lazy = False

def __init__(self, settings):
super(DataURIDownloadHandler, self).__init__()

Expand Down
2 changes: 2 additions & 0 deletions scrapy/core/downloader/handlers/file.py
Expand Up @@ -2,7 +2,9 @@
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers


class FileDownloadHandler(object):
lazy = False

def __init__(self, settings):
pass
Expand Down
3 changes: 3 additions & 0 deletions scrapy/core/downloader/handlers/ftp.py
Expand Up @@ -60,7 +60,10 @@ def close(self):
self.body.close() if self.filename else self.body.seek(0)

_CODE_RE = re.compile("\d+")


class FTPDownloadHandler(object):
lazy = False

CODE_MAPPING = {
"550": 404,
Expand Down
1 change: 1 addition & 0 deletions scrapy/core/downloader/handlers/http10.py
Expand Up @@ -6,6 +6,7 @@


class HTTP10DownloadHandler(object):
lazy = False

def __init__(self, settings):
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
Expand Down
1 change: 1 addition & 0 deletions scrapy/core/downloader/handlers/http11.py
Expand Up @@ -33,6 +33,7 @@


class HTTP11DownloadHandler(object):
lazy = False

def __init__(self, settings):
self._pool = HTTPConnectionPool(reactor, persistent=True)
Expand Down
27 changes: 22 additions & 5 deletions tests/test_downloader_handlers.py
Expand Up @@ -41,13 +41,23 @@
from tests.mockserver import MockServer, ssl_context_factory, Echo
from tests.spiders import SingleRequestSpider


class DummyDH(object):
    """Stub download handler marked non-lazy: instantiated eagerly at startup."""

    lazy = False

    def __init__(self, crawler):
        pass


class DummyLazyDH(object):
    """Stub download handler with no explicit ``lazy`` attribute; handlers
    default to lazy for backwards compatibility."""

    def __init__(self, crawler):
        pass


class OffDH(object):
    """Stub download handler that always refuses to configure itself by
    raising NotConfigured from its constructor."""

    lazy = False

    def __init__(self, crawler):
        raise NotConfigured
Expand All @@ -60,8 +70,6 @@ def test_enabled_handler(self):
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertIn('scheme', dh._schemes)
for scheme in handlers: # force load handlers
dh._get_handler(scheme)
self.assertIn('scheme', dh._handlers)
self.assertNotIn('scheme', dh._notconfigured)

Expand All @@ -70,8 +78,6 @@ def test_not_configured_handler(self):
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertIn('scheme', dh._schemes)
for scheme in handlers: # force load handlers
dh._get_handler(scheme)
self.assertNotIn('scheme', dh._handlers)
self.assertIn('scheme', dh._notconfigured)

Expand All @@ -80,11 +86,22 @@ def test_disabled_handler(self):
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertNotIn('scheme', dh._schemes)
for scheme in handlers: # force load handlers
for scheme in handlers: # force load handlers
dh._get_handler(scheme)
self.assertNotIn('scheme', dh._handlers)
self.assertIn('scheme', dh._notconfigured)

def test_lazy_handlers(self):
    """A handler without ``lazy = False`` must not be instantiated at
    startup; it is only created on the first request for its scheme."""
    handler_paths = {'scheme': 'tests.test_downloader_handlers.DummyLazyDH'}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handler_paths})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    self.assertNotIn('scheme', dh._handlers)
    # Requesting each scheme forces the lazy handler to be instantiated.
    for scheme in handler_paths:
        dh._get_handler(scheme)
    self.assertIn('scheme', dh._handlers)
    self.assertNotIn('scheme', dh._notconfigured)


class FileTestCase(unittest.TestCase):

Expand Down