Skip to content

Commit

Permalink
raise scrapy-specific exception on spider middleware
Browse files Browse the repository at this point in the history
to indicate an invalid value has been returned
  • Loading branch information
elacuesta committed Nov 28, 2016
1 parent caf8373 commit ffbf09f
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 20 deletions.
11 changes: 11 additions & 0 deletions docs/topics/exceptions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ remain disabled. Those components include:

The exception must be raised in the component's ``__init__`` method.

InvalidValue
------------

.. exception:: InvalidValue

This exception can be raised by a downloader or spider middleware to
indicate that some method returned a value not supported by the processing
chain.
See :ref:`topics-spider-middleware` and :ref:`topics-downloader-middleware`
for a list of supported output values.

NotSupported
------------

Expand Down
27 changes: 13 additions & 14 deletions scrapy/core/spidermw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import six
from twisted.python.failure import Failure
from scrapy.exceptions import InvalidValue
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.conf import build_component_list
Expand Down Expand Up @@ -38,32 +39,29 @@ def scrape_response(self, scrape_func, response, request, spider):
six.get_method_self(f).__class__.__name__,
six.get_method_function(f).__name__)

re_assertion_error = re.compile("Middleware .*\.process_spider_input must return\s" \
"None or raise an exception, got <type|class '.*'>")

def process_spider_input(response):
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
assert result is None, \
'Middleware %s must return None or ' \
'raise an exception, got %s ' \
% (fname(method), type(result))
if result is not None:
raise InvalidValue('Middleware {} must return None or raise ' \
'an exception, got {}'.format(fname(method), type(result)))
except:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)

def process_spider_exception(_failure):
exception = _failure.value
# ignore AssertionError from middleware's manager
if isinstance(exception, AssertionError) and len(exception.args) > 0 \
and re_assertion_error.search(exception.args[0]):
# ignore InvalidValue exception from middleware's manager
if isinstance(exception, InvalidValue):
return _failure
for method in self.methods['process_spider_exception']:
result = method(response=response, exception=exception, spider=spider)
assert result is None or _isiterable(result), \
'Middleware %s must return None, or an iterable object, got %s ' % \
(fname(method), type(result))
if result is not None and not _isiterable(result):
raise InvalidValue('Middleware {} must return None or an iterable ' \
'object, got {}'.format(fname(method), type(result)))
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
if result is not None:
return result
return _failure
Expand All @@ -84,7 +82,8 @@ def wrapper(result_iterable):
if _isiterable(result):
result = wrapper(result)
else:
raise AssertionError('Middleware %s must return an iterable object, got %s' % (fname(method), type(result)))
raise InvalidValue('Middleware {} must return an iterable object, ' \
'got {}'.format(fname(method), type(result)))
return result

dfd = mustbe_deferred(process_spider_input, response)
Expand Down
5 changes: 5 additions & 0 deletions scrapy/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ class NotConfigured(Exception):
"""Indicates a missing configuration situation"""
pass

class InvalidValue(TypeError):
"""Indicates an invalid value has been returned
by a middleware's processing method"""
pass

# HTTP and crawling

class IgnoreRequest(Exception):
Expand Down
12 changes: 6 additions & 6 deletions tests/test_spidermiddleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def process_spider_exception(self, response, exception, spider):


# ================================================================================
# don't catch AssertionError from scrapy's spider middleware manager
# don't catch InvalidValue from scrapy's spider middleware manager
class InvalidReturnValueFromPreviousMiddlewareInputSpider(Spider):
start_urls = ["http://example.com/"]
name = 'invalid_return_value_from_previous_middleware_input'
Expand Down Expand Up @@ -288,19 +288,19 @@ def test_process_spider_exception_do_something(self):

@defer.inlineCallbacks
def test_process_spider_exception_invalid_return_value_previous_middleware(self):
""" don't catch AssertionError """
""" don't catch InvalidValue from middleware """
# on middleware's input
crawler1 = get_crawler(InvalidReturnValueFromPreviousMiddlewareInputSpider)
with LogCapture() as log1:
yield crawler1.crawl()
self.assertNotIn("AssertionError exception caught", str(log1))
self.assertIn("'spider_exceptions/AssertionError'", str(log1))
self.assertNotIn("InvalidValue exception caught", str(log1))
self.assertIn("'spider_exceptions/InvalidValue'", str(log1))
# on middleware's output
crawler2 = get_crawler(InvalidReturnValueFromPreviousMiddlewareOutputSpider)
with LogCapture() as log2:
yield crawler2.crawl()
self.assertNotIn("AssertionError exception caught", str(log2))
self.assertIn("'spider_exceptions/AssertionError'", str(log2))
self.assertNotIn("InvalidValue exception caught", str(log2))
self.assertIn("'spider_exceptions/InvalidValue'", str(log2))

@defer.inlineCallbacks
def test_process_spider_exception_execution_chain(self):
Expand Down

0 comments on commit ffbf09f

Please sign in to comment.