Merge pull request #1528 from scrapy/create-crawler
public Crawler.create_crawler method
curita committed Oct 30, 2015
2 parents 9424ca0 + 0000b6e commit 57f87b9
Showing 4 changed files with 69 additions and 25 deletions.
scrapy/crawler.py: 16 additions & 3 deletions
@@ -148,9 +148,7 @@ def crawl(self, crawler_or_spidercls, *args, **kwargs):
:param dict kwargs: keyword arguments to initialize the spider
"""
-        crawler = crawler_or_spidercls
-        if not isinstance(crawler_or_spidercls, Crawler):
-            crawler = self._create_crawler(crawler_or_spidercls)
+        crawler = self.create_crawler(crawler_or_spidercls)
return self._crawl(crawler, *args, **kwargs)

def _crawl(self, crawler, *args, **kwargs):
@@ -165,6 +163,21 @@ def _done(result):

return d.addBoth(_done)

def create_crawler(self, crawler_or_spidercls):
"""
Return a :class:`~scrapy.crawler.Crawler` object.
* If `crawler_or_spidercls` is a Crawler, it is returned as-is.
* If `crawler_or_spidercls` is a Spider subclass, a new Crawler
is constructed for it.
* If `crawler_or_spidercls` is a string, this function finds
a spider with this name in a Scrapy project (using spider loader),
then creates a Crawler instance for it.
"""
if isinstance(crawler_or_spidercls, Crawler):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)

def _create_crawler(self, spidercls):
if isinstance(spidercls, six.string_types):
spidercls = self.spider_loader.load(spidercls)
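
For context, a minimal usage sketch of the newly public method in a standalone script. This is not part of the changeset; the spider class, its name and the start URL are hypothetical, and the runner still needs a running Twisted reactor to perform the crawl:

    from twisted.internet import reactor

    import scrapy
    from scrapy.crawler import CrawlerRunner

    class QuotesSpider(scrapy.Spider):
        # hypothetical spider, used only to illustrate the API
        name = 'quotes'
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            for text in response.css('span.text::text').extract():
                yield {'text': text}

    runner = CrawlerRunner({'LOG_LEVEL': 'INFO'})

    # create_crawler accepts a Crawler, a Spider subclass or a spider name;
    # here it wraps the Spider subclass in a Crawler without starting a crawl,
    # so settings and signals can be inspected or connected first.
    crawler = runner.create_crawler(QuotesSpider)

    # crawl() now resolves its argument through the same method, so passing
    # the prepared Crawler back in is equivalent to passing the class.
    d = runner.crawl(crawler)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
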
scrapy/utils/test.py: 2 additions & 3 deletions
@@ -26,11 +26,10 @@ def get_crawler(spidercls=None, settings_dict=None):
priority.
"""
from scrapy.crawler import CrawlerRunner
-    from scrapy.settings import Settings
from scrapy.spiders import Spider

-    runner = CrawlerRunner(Settings(settings_dict))
-    return runner._create_crawler(spidercls or Spider)
+    runner = CrawlerRunner(settings_dict)
+    return runner.create_crawler(spidercls or Spider)

def get_pythonpath():
"""Return a PYTHONPATH suitable to use in processes so that they find this
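
As a rough illustration of the helper (hypothetical test code, not part of this changeset), get_crawler builds an un-started crawler whose settings include the given overrides, now going through the public create_crawler instead of the private method:

    from scrapy.spiders import Spider
    from scrapy.utils.test import get_crawler

    class DummySpider(Spider):
        # hypothetical spider, for illustration only
        name = 'dummy'

    crawler = get_crawler(DummySpider, {'DOWNLOAD_DELAY': 2})
    assert crawler.spidercls is DummySpider
    assert crawler.settings.getint('DOWNLOAD_DELAY') == 2
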
tests/test_crawl.py: 38 additions & 19 deletions
@@ -6,26 +6,27 @@
from twisted.internet import defer
from twisted.trial.unittest import TestCase

-from scrapy.utils.test import get_crawler
+from scrapy.http import Request
+from scrapy.crawler import CrawlerRunner
from tests import mock
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
from tests.mockserver import MockServer
-from scrapy.http import Request


class CrawlTestCase(TestCase):

def setUp(self):
self.mockserver = MockServer()
self.mockserver.__enter__()
self.runner = CrawlerRunner()

def tearDown(self):
self.mockserver.__exit__(None, None, None)

@defer.inlineCallbacks
def test_follow_all(self):
-        crawler = get_crawler(FollowAllSpider)
+        crawler = self.runner.create_crawler(FollowAllSpider)
yield crawler.crawl()
self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url

@@ -41,7 +42,7 @@ def test_delay(self):
@defer.inlineCallbacks
def _test_delay(self, delay, randomize):
settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
-        crawler = get_crawler(FollowAllSpider, settings)
+        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
yield crawler.crawl(maxlatency=delay * 2)
t = crawler.spider.times
totaltime = t[-1] - t[0]
@@ -52,15 +53,15 @@ def _test_delay(self, delay, randomize):

@defer.inlineCallbacks
def test_timeout_success(self):
-        crawler = get_crawler(DelaySpider)
+        crawler = self.runner.create_crawler(DelaySpider)
yield crawler.crawl(n=0.5)
self.assertTrue(crawler.spider.t1 > 0)
self.assertTrue(crawler.spider.t2 > 0)
self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

@defer.inlineCallbacks
def test_timeout_failure(self):
-        crawler = get_crawler(DelaySpider, {"DOWNLOAD_TIMEOUT": 0.35})
+        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
yield crawler.crawl(n=0.5)
self.assertTrue(crawler.spider.t1 > 0)
self.assertTrue(crawler.spider.t2 == 0)
@@ -75,14 +76,14 @@ def test_timeout_failure(self):

@defer.inlineCallbacks
def test_retry_503(self):
-        crawler = get_crawler(SimpleSpider)
+        crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/status?n=503")
self._assert_retried(l)

@defer.inlineCallbacks
def test_retry_conn_failed(self):
-        crawler = get_crawler(SimpleSpider)
+        crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://localhost:65432/status?n=503")
self._assert_retried(l)
@@ -91,15 +92,15 @@ def test_retry_conn_failed(self):
def test_retry_dns_error(self):
with mock.patch('socket.gethostbyname',
side_effect=socket.gaierror(-5, 'No address associated with hostname')):
-            crawler = get_crawler(SimpleSpider)
+            crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://example.com/")
self._assert_retried(l)

@defer.inlineCallbacks
def test_start_requests_bug_before_yield(self):
with LogCapture('scrapy', level=logging.ERROR) as l:
-            crawler = get_crawler(BrokenStartRequestsSpider)
+            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_before_yield=1)

self.assertEqual(len(l.records), 1)
@@ -110,7 +111,7 @@ def test_start_requests_bug_before_yield(self):
@defer.inlineCallbacks
def test_start_requests_bug_yielding(self):
with LogCapture('scrapy', level=logging.ERROR) as l:
-            crawler = get_crawler(BrokenStartRequestsSpider)
+            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_yielding=1)

self.assertEqual(len(l.records), 1)
@@ -121,7 +122,7 @@ def test_start_requests_bug_yielding(self):
@defer.inlineCallbacks
def test_start_requests_lazyness(self):
settings = {"CONCURRENT_REQUESTS": 1}
-        crawler = get_crawler(BrokenStartRequestsSpider, settings)
+        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
yield crawler.crawl()
#self.assertTrue(False, crawler.spider.seedsseen)
#self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
@@ -130,7 +131,7 @@ def test_start_requests_lazyness(self):
@defer.inlineCallbacks
def test_start_requests_dupes(self):
settings = {"CONCURRENT_REQUESTS": 1}
-        crawler = get_crawler(DuplicateStartRequestsSpider, settings)
+        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3)
self.assertEqual(crawler.spider.visited, 6)

@@ -159,23 +160,23 @@ def test_unbounded_response(self):
foo body
with multiples lines
'''})
-        crawler = get_crawler(SimpleSpider)
+        crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
self.assertEqual(str(l).count("Got response 200"), 1)

@defer.inlineCallbacks
def test_retry_conn_lost(self):
# connection lost after receiving data
-        crawler = get_crawler(SimpleSpider)
+        crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/drop?abort=0")
self._assert_retried(l)

@defer.inlineCallbacks
def test_retry_conn_aborted(self):
# connection lost before receiving data
-        crawler = get_crawler(SimpleSpider)
+        crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/drop?abort=1")
self._assert_retried(l)
@@ -194,7 +195,7 @@ def test_referer_header(self):
req0.meta['next'] = req1
req1.meta['next'] = req2
req2.meta['next'] = req3
-        crawler = get_crawler(SingleRequestSpider)
+        crawler = self.runner.create_crawler(SingleRequestSpider)
yield crawler.crawl(seed=req0)
# basic asserts in case of weird communication errors
self.assertIn('responses', crawler.spider.meta)
@@ -220,7 +221,7 @@ def test_engine_status(self):
def cb(response):
est.append(get_engine_status(crawler.engine))

-        crawler = get_crawler(SingleRequestSpider)
+        crawler = self.runner.create_crawler(SingleRequestSpider)
yield crawler.crawl(seed='http://localhost:8998/', callback_func=cb)
self.assertEqual(len(est), 1, est)
s = dict(est[0])
@@ -244,6 +245,24 @@ class FaultySpider(SimpleSpider):
def start_requests(self):
raise TestError

-        crawler = get_crawler(FaultySpider)
+        crawler = self.runner.create_crawler(FaultySpider)
yield self.assertFailure(crawler.crawl(), TestError)
self.assertFalse(crawler.crawling)

@defer.inlineCallbacks
def test_crawlerrunner_accepts_crawler(self):
crawler = self.runner.create_crawler(SimpleSpider)
with LogCapture() as log:
yield self.runner.crawl(crawler, "http://localhost:8998/status?n=200")
self.assertIn("Got response 200", str(log))

@defer.inlineCallbacks
def test_crawl_multiple(self):
self.runner.crawl(SimpleSpider, "http://localhost:8998/status?n=200")
self.runner.crawl(SimpleSpider, "http://localhost:8998/status?n=503")

with LogCapture() as log:
yield self.runner.join()

self._assert_retried(log)
self.assertIn("Got response 200", str(log))
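
The two new tests above mirror what user code can now do directly: hand a prepared Crawler to crawl(), or queue several crawls on one runner and wait for them all. A hedged sketch, not part of the changeset; the spider classes and URLs are placeholders:

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    class SpiderA(Spider):
        # placeholder spiders; a real project would do useful work in parse()
        name = 'a'
        start_urls = ['http://example.com/a']

        def parse(self, response):
            self.logger.info('got %s', response.url)

    class SpiderB(SpiderA):
        name = 'b'
        start_urls = ['http://example.com/b']

    runner = CrawlerRunner()

    # A pre-built Crawler and a plain Spider subclass are both accepted.
    runner.crawl(runner.create_crawler(SpiderA))
    runner.crawl(SpiderB)

    # join() fires once every crawl scheduled on this runner has finished.
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
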
tests/test_spiderloader/__init__.py: 13 additions & 0 deletions
@@ -8,10 +8,12 @@

# ugly hack to avoid cyclic imports of scrapy.spiders when running this test
# alone
import scrapy
from scrapy.interfaces import ISpiderLoader
from scrapy.spiderloader import SpiderLoader
from scrapy.settings import Settings
from scrapy.http import Request
from scrapy.crawler import CrawlerRunner

module_dir = os.path.dirname(os.path.abspath(__file__))

@@ -76,3 +78,14 @@ def test_load_base_spider(self):
settings = Settings({'SPIDER_MODULES': [module]})
self.spider_loader = SpiderLoader.from_settings(settings)
assert len(self.spider_loader._spiders) == 0

def test_crawler_runner_loading(self):
module = 'tests.test_spiderloader.test_spiders.spider1'
runner = CrawlerRunner({'SPIDER_MODULES': [module]})

self.assertRaisesRegexp(KeyError, 'Spider not found',
runner.create_crawler, 'spider2')

crawler = runner.create_crawler('spider1')
self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
self.assertEqual(crawler.spidercls.name, 'spider1')
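
The same string-based lookup is available outside the test suite whenever SPIDER_MODULES points at a project's spider modules. A small sketch (module path and spider name are placeholders, not part of this changeset):

    from scrapy.crawler import CrawlerRunner

    # 'myproject.spiders' and 'myspider' stand in for a real project layout.
    runner = CrawlerRunner({'SPIDER_MODULES': ['myproject.spiders']})

    # The name is resolved through the spider loader; an unknown name raises
    # KeyError('Spider not found: ...'), as the test above checks.
    crawler = runner.create_crawler('myspider')
    assert crawler.spidercls.name == 'myspider'
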
