-
Notifications
You must be signed in to change notification settings - Fork 10.4k
/
crawler.py
98 lines (78 loc) · 3.26 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import signal
from twisted.internet import reactor, defer
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.engine import ExecutionEngine
from scrapy.extension import ExtensionManager
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.misc import load_object
from scrapy import log, signals
class Crawler(object):
    """Bundle the settings, extensions, spider manager and execution engine
    needed to run spiders.

    Only ``settings`` and the ``configured`` flag exist after construction;
    ``extensions``, ``spiders`` and ``engine`` are created by ``configure()``.
    """

    def __init__(self, settings):
        """Store ``settings``; the components are built by ``configure()``."""
        self.configured = False
        self.settings = settings

    def install(self):
        """Publish this crawler as ``scrapy.project.crawler``.

        Raises ``AssertionError`` if a crawler is already installed.
        """
        import scrapy.project
        # Raised explicitly instead of using an ``assert`` statement so the
        # check is not stripped when Python runs with -O.
        if hasattr(scrapy.project, 'crawler'):
            raise AssertionError("crawler already installed")
        scrapy.project.crawler = self

    def uninstall(self):
        """Remove the ``scrapy.project.crawler`` attribute.

        Raises ``AssertionError`` if no crawler is installed.
        """
        import scrapy.project
        # Explicit raise (not ``assert``) so the check survives -O; otherwise
        # the ``del`` below would fail with a less helpful AttributeError.
        if not hasattr(scrapy.project, 'crawler'):
            raise AssertionError("crawler not installed")
        del scrapy.project.crawler

    def configure(self):
        """Build the extensions, spider manager and engine from the settings.

        Does nothing if it has already been called on this instance.
        """
        if self.configured:
            return
        # The flag is set before the components are built, so a second call
        # is a no-op even if construction below fails part-way.
        self.configured = True
        self.extensions = ExtensionManager.from_settings(self.settings)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_settings(self.settings)
        # The engine is handed ``_spider_closed`` as its second argument.
        self.engine = ExecutionEngine(self.settings, self._spider_closed)

    def crawl(self, spider, requests=None):
        """Attach ``spider`` to this crawler and open it in the engine.

        If ``requests`` is None, ``spider.start_requests()`` supplies them.
        Returns whatever ``engine.open_spider()`` returns. Uses
        ``self.engine``, so ``configure()`` must have been called first.
        """
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        """Stop the crawler once the engine has no open spiders left.

        ``spider`` is accepted but not used.
        """
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        """Configure the crawler (if needed), then start the engine."""
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        """Stop the engine if it is currently running."""
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
class CrawlerProcess(Crawler):
    """Run a single Scrapy crawler inside one process.

    Drives the Twisted reactor on behalf of the crawler and installs
    shutdown signal handlers: the first signal asks for a graceful stop,
    a second one forces the reactor to stop.
    """

    def __init__(self, *a, **kw):
        """Initialise the crawler and hook up the shutdown machinery."""
        super(CrawlerProcess, self).__init__(*a, **kw)
        # Call stop() whenever the engine_stopped signal is sent.
        dispatcher.connect(self.stop, signals.engine_stopped)
        install_shutdown_handlers(self._signal_shutdown)

    def start(self):
        """Start the crawler, then run the reactor (blocks until it stops)."""
        # NOTE(review): the Deferred returned by the parent start() is
        # discarded here, so a failure in it is not handled by this method.
        super(CrawlerProcess, self).start()
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        # This class installs its own handlers in __init__, so the reactor
        # is told not to install signal handlers itself.
        reactor.run(installSignalHandlers=False)  # blocking call

    def stop(self):
        """Stop the crawler, then stop the reactor whatever the outcome.

        Returns the Deferred produced by the parent ``stop()``.
        """
        stopping = super(CrawlerProcess, self).stop()
        stopping.addBoth(self._stop_reactor)
        return stopping

    def _stop_reactor(self, _=None):
        """Stop the reactor; the argument is ignored.

        The optional argument lets this be used as a Deferred callback.
        """
        try:
            reactor.stop()
        except RuntimeError:
            # Raised if already stopped or in shutdown stage.
            pass

    def _signal_shutdown(self, signum, _):
        """Handle the first shutdown signal with a graceful stop."""
        # Swap handlers first so a repeated signal goes to _signal_kill.
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        message = ("Received %s, shutting down gracefully. Send again to "
                   "force unclean shutdown") % signame
        log.msg(message, level=log.INFO)
        # Presumably scheduled via the reactor so stop() runs in the reactor
        # loop rather than inside the signal handler -- confirm.
        reactor.callFromThread(self.stop)

    def _signal_kill(self, signum, _):
        """Handle a repeated shutdown signal by stopping the reactor."""
        # Re-register with SIG_IGN so later signals are no longer handled
        # by this class.
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        message = 'Received %s twice, forcing unclean shutdown' % signame
        log.msg(message, level=log.INFO)
        reactor.callFromThread(self._stop_reactor)