-
Notifications
You must be signed in to change notification settings - Fork 10.4k
/
spidermw.py
119 lines (100 loc) · 5.4 KB
/
spidermw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# pylint: disable=bad-continuation,bad-whitespace,broad-except,invalid-name,line-too-long,missing-docstring,no-else-return
"""
Spider Middleware manager
See documentation in docs/topics/spider-middleware.rst
"""
from itertools import chain, islice
import six
from twisted.python.failure import Failure
from scrapy.exceptions import _InvalidOutput
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.conf import build_component_list
from scrapy.utils.python import MutableChain
def _isiterable(possible_iterator):
return hasattr(possible_iterator, '__iter__')
class SpiderMiddlewareManager(MiddlewareManager):
    """Middleware manager that runs the spider middleware hooks.

    Hooks are collected per middleware in ``self.methods`` under the keys
    ``process_spider_input``, ``process_start_requests``,
    ``process_spider_output`` and ``process_spider_exception``.
    """

    component_name = 'spider middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """Return the component list built from the ``SPIDER_MIDDLEWARES``
        setting, as obtained through ``settings.getwithbase``.
        """
        configured = settings.getwithbase('SPIDER_MIDDLEWARES')
        return build_component_list(configured)

    def _add_middleware(self, mw):
        """Register the hooks that *mw* implements.

        ``process_spider_input`` is appended, while the other hooks are
        pushed to the left of their respective lists.
        """
        super(SpiderMiddlewareManager, self)._add_middleware(mw)
        if hasattr(mw, 'process_spider_input'):
            self.methods['process_spider_input'].append(mw.process_spider_input)
        if hasattr(mw, 'process_start_requests'):
            self.methods['process_start_requests'].appendleft(mw.process_start_requests)
        # These two lists always receive an entry (``None`` when the hook is
        # missing) so that a given index refers to the same middleware in
        # both of them; scrape_response() relies on that when it hands
        # control from one chain to the other.
        for hook_name in ('process_spider_output', 'process_spider_exception'):
            self.methods[hook_name].appendleft(getattr(mw, hook_name, None))

    def scrape_response(self, scrape_func, response, request, spider):
        """Run *response* through the spider middleware chain.

        The ``process_spider_input`` hooks are called first, then
        *scrape_func*; its outcome goes through the ``process_spider_output``
        hooks on success or the ``process_spider_exception`` hooks on
        failure.

        Returns the deferred obtained from ``mustbe_deferred`` with both
        chains attached.  ``_InvalidOutput`` is raised when a hook returns a
        value of an unsupported kind.
        """

        def fname(f):
            # "<ClassName>.<method name>" of a bound method, for error messages
            owner_name = six.get_method_self(f).__class__.__name__
            function_name = six.get_method_function(f).__name__
            return '%s.%s' % (owner_name, function_name)

        def process_spider_input(response):
            for input_hook in self.methods['process_spider_input']:
                try:
                    returned = input_hook(response=response, spider=spider)
                    if returned is not None:
                        message = 'Middleware {} must return None or raise an exception, got {}'.format(
                            fname(input_hook), type(returned))
                        raise _InvalidOutput(message)
                except _InvalidOutput:
                    # contract violations are never turned into a Failure
                    raise
                except Exception:
                    # Failure() wraps the exception currently being handled;
                    # the remaining input hooks are skipped
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure, start_index=0):
            exception = _failure.value
            # _InvalidOutput is passed through untouched, it is not offered
            # to the middlewares
            if isinstance(exception, _InvalidOutput):
                return _failure
            exception_hooks = islice(self.methods['process_spider_exception'], start_index, None)
            for hook_index, exception_hook in enumerate(exception_hooks, start=start_index):
                if exception_hook is None:
                    continue
                returned = exception_hook(response=response, exception=exception, spider=spider)
                if returned is None:
                    # this middleware did not handle it, try the next one
                    continue
                if not _isiterable(returned):
                    message = 'Middleware {} must return None or an iterable, got {}'.format(
                        fname(exception_hook), type(returned))
                    raise _InvalidOutput(message)
                # An iterable ends exception handling: control moves to the
                # process_spider_output chain, starting right after the
                # middleware that produced it.
                return process_spider_output(returned, hook_index + 1)
            return _failure

        def process_spider_output(result, start_index=0):
            # Elements collected here were produced by the
            # process_spider_exception chain, which already sent them through
            # process_spider_output, so they are only appended at the end.
            recovered = MutableChain()

            def evaluate_iterable(iterable, index):
                try:
                    for element in iterable:
                        yield element
                except Exception as exc:
                    handled = process_spider_exception(Failure(exc), index + 1)
                    if isinstance(handled, Failure):
                        # nobody recovered from it: re-raise the original
                        raise
                    recovered.extend(handled)

            output_hooks = islice(self.methods['process_spider_output'], start_index, None)
            for hook_index, output_hook in enumerate(output_hooks, start=start_index):
                if output_hook is None:
                    continue
                # the call itself may fail when the hook is not a generator
                try:
                    result = output_hook(response=response, result=result, spider=spider)
                except Exception as exc:
                    handled = process_spider_exception(Failure(exc), hook_index + 1)
                    if isinstance(handled, Failure):
                        raise
                    return handled
                if not _isiterable(result):
                    message = 'Middleware {} must return an iterable, got {}'.format(
                        fname(output_hook), type(result))
                    raise _InvalidOutput(message)
                # wrap lazily so that errors raised while iterating reach
                # process_spider_exception as well
                result = evaluate_iterable(result, hook_index)

            return chain(result, recovered)

        dfd = mustbe_deferred(process_spider_input, response)
        dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
        return dfd

    def process_start_requests(self, start_requests, spider):
        """Pass *start_requests* and *spider* to ``self._process_chain`` for
        the ``process_start_requests`` hooks and return its result.
        """
        return self._process_chain('process_start_requests', start_requests, spider)