From 9cbbfd8b04835c40568b687ef8b13d901db988cb Mon Sep 17 00:00:00 2001
From: Julia Medina
Date: Wed, 6 Aug 2014 08:51:12 -0300
Subject: [PATCH] Adjust spiders' utils to new SpiderManager API

---
 scrapy/commands/fetch.py | 14 +++++-------
 scrapy/commands/parse.py | 49 +++++++++++++++++++++-------------------
 scrapy/commands/shell.py | 26 +++++++++++++++------
 scrapy/shell.py          |  9 +++-----
 scrapy/utils/spider.py   | 18 +++++++++------
 5 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/scrapy/commands/fetch.py b/scrapy/commands/fetch.py
index 373d323c75f..ca9fd57f5c7 100644
--- a/scrapy/commands/fetch.py
+++ b/scrapy/commands/fetch.py
@@ -3,9 +3,8 @@
 
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
-from scrapy.spider import Spider
 from scrapy.exceptions import UsageError
-from scrapy.utils.spider import create_spider_for_request
+from scrapy.utils.spider import spidercls_for_request, DefaultSpider
 
 
 class Command(ScrapyCommand):
@@ -48,12 +47,11 @@ def run(self, args, opts):
         request = Request(args[0], callback=cb, dont_filter=True)
         request.meta['handle_httpstatus_all'] = True
 
-        crawler = self.crawler_process.create_crawler()
-        spider = None
+        spidercls = DefaultSpider
+        spiders = self.crawler_process.spiders
         if opts.spider:
-            spider = crawler.spiders.create(opts.spider)
+            spidercls = spiders.load(opts.spider)
         else:
-            spider = create_spider_for_request(crawler.spiders, request, \
-                default_spider=Spider('default'))
-        crawler.crawl(spider, [request])
+            spidercls = spidercls_for_request(spiders, request, spidercls)
+        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
         self.crawler_process.start()
diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py
index 0867a21a04f..01c7fff0a46 100644
--- a/scrapy/commands/parse.py
+++ b/scrapy/commands/parse.py
@@ -5,7 +5,7 @@
 from scrapy.item import BaseItem
 from scrapy.utils import display
 from scrapy.utils.conf import arglist_to_dict
-from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
+from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
 from scrapy.exceptions import UsageError
 from scrapy import log
 
@@ -113,41 +113,45 @@ def run_callback(self, response, cb):
                 requests.append(x)
         return items, requests
 
-    def get_callback_from_rules(self, response):
-        if getattr(self.spider, 'rules', None):
-            for rule in self.spider.rules:
+    def get_callback_from_rules(self, spider, response):
+        if getattr(spider, 'rules', None):
+            for rule in spider.rules:
                 if rule.link_extractor.matches(response.url) and rule.callback:
                     return rule.callback
         else:
            log.msg(format='No CrawlSpider rules found in spider %(spider)r, '
                           'please specify a callback to use for parsing',
-                   level=log.ERROR, spider=self.spider.name)
+                   level=log.ERROR, spider=spider.name)
 
-    def set_spider(self, url, opts):
+    def set_spidercls(self, url, opts):
+        spiders = self.crawler_process.spiders
         if opts.spider:
             try:
-                self.spider = self.pcrawler.spiders.create(opts.spider, **opts.spargs)
+                self.spidercls = spiders.load(opts.spider)
             except KeyError:
                 log.msg(format='Unable to find spider: %(spider)s',
                         level=log.ERROR, spider=opts.spider)
         else:
-            self.spider = create_spider_for_request(self.pcrawler.spiders, Request(url), **opts.spargs)
-            if not self.spider:
+            self.spidercls = spidercls_for_request(spiders, Request(url))
+            if not self.spidercls:
                 log.msg(format='Unable to find spider for: %(url)s',
                         level=log.ERROR, url=url)
 
-    def start_parsing(self, url, opts):
         request = Request(url, opts.callback)
-        request = self.prepare_request(request, opts)
+        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
+        self.spidercls.start_requests = _start_requests
+
 
-        self.pcrawler.crawl(self.spider, [request])
+    def start_parsing(self, url, opts):
+        self.crawler_process.crawl(self.spidercls, **opts.spargs)
+        self.pcrawler = list(self.crawler_process.crawlers)[0]
         self.crawler_process.start()
 
         if not self.first_response:
-            log.msg(format='No response downloaded for: %(request)s',
-                    level=log.ERROR, request=request)
+            log.msg(format='No response downloaded for: %(url)s',
+                    level=log.ERROR, url=url)
 
-    def prepare_request(self, request, opts):
+    def prepare_request(self, spider, request, opts):
         def callback(response):
             # memorize first request
             if not self.first_response:
@@ -157,17 +161,17 @@ def callback(response):
             cb = response.meta['_callback']
             if not cb:
                 if opts.rules and self.first_response == response:
-                    cb = self.get_callback_from_rules(response)
+                    cb = self.get_callback_from_rules(spider, response)
                 else:
                     cb = 'parse'
 
             if not callable(cb):
-                cb_method = getattr(self.spider, cb, None)
+                cb_method = getattr(spider, cb, None)
                 if callable(cb_method):
                     cb = cb_method
                 else:
                     log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s',
-                            callback=callback, spider=self.spider.name, level=log.ERROR)
+                            callback=callback, spider=spider.name, level=log.ERROR)
                     return
 
             # parse items and requests
@@ -177,7 +181,7 @@ def callback(response):
             if opts.pipelines:
                 itemproc = self.pcrawler.engine.scraper.itemproc
                 for item in items:
-                    itemproc.process_item(item, self.spider)
+                    itemproc.process_item(item, spider)
             self.add_items(depth, items)
             self.add_requests(depth, requests)
 
@@ -207,10 +211,9 @@ def run(self, args, opts):
         else:
             url = args[0]
 
-        # prepare spider
-        self.pcrawler = self.crawler_process.create_crawler()
-        self.set_spider(url, opts)
+        # prepare spidercls
+        self.set_spidercls(url, opts)
 
-        if self.spider and opts.depth > 0:
+        if self.spidercls and opts.depth > 0:
             self.start_parsing(url, opts)
             self.print_results(opts)
diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py
index ab170e665d9..e4d32c31421 100644
--- a/scrapy/commands/shell.py
+++ b/scrapy/commands/shell.py
@@ -8,6 +8,9 @@
 
 from scrapy.command import ScrapyCommand
 from scrapy.shell import Shell
+from scrapy.http import Request
+from scrapy import log
+from scrapy.utils.spider import spidercls_for_request, DefaultSpider
 
 
 class Command(ScrapyCommand):
@@ -38,18 +41,27 @@ def update_vars(self, vars):
         pass
 
     def run(self, args, opts):
-        crawler = self.crawler_process.create_crawler()
-
         url = args[0] if args else None
-        spider = crawler.spiders.create(opts.spider) if opts.spider else None
-
-        self.crawler_process.start_crawling()
+        spiders = self.crawler_process.spiders
+
+        spidercls = DefaultSpider
+        if opts.spider:
+            spidercls = spiders.load(opts.spider)
+        elif url:
+            spidercls = spidercls_for_request(spiders, Request(url),
+                                              spidercls, log_multiple=True)
+        crawler = self.crawler_process._create_logged_crawler(spidercls)
+        crawler.engine = crawler._create_engine()
+        crawler.engine.start()
+
+        self.crawler_process._start_logging()
         self._start_crawler_thread()
 
         shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
-        shell.start(url=url, spider=spider)
+        shell.start(url=url)
 
     def _start_crawler_thread(self):
-        t = Thread(target=self.crawler_process.start_reactor)
+        t = Thread(target=self.crawler_process._start_reactor,
+                   kwargs={'stop_after_crawl': False})
         t.daemon = True
         t.start()
diff --git a/scrapy/shell.py b/scrapy/shell.py
index 74eaef40f5f..6c48ef18664 100644
--- a/scrapy/shell.py
+++ b/scrapy/shell.py
@@ -21,7 +21,6 @@
 from scrapy.utils.console import start_python_console
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
-from scrapy.utils.spider import create_spider_for_request
 
 
 class Shell(object):
@@ -67,11 +66,9 @@ def _open_spider(self, request, spider):
             return self.spider
 
         if spider is None:
-            spider = create_spider_for_request(self.crawler.spiders,
-                                               request,
-                                               Spider('default'),
-                                               log_multiple=True)
-        spider.set_crawler(self.crawler)
+            spider = self.crawler.spider or self.crawler._create_spider()
+
+        self.crawler.spider = spider
         self.crawler.engine.open_spider(spider, close_if_idle=False)
         self.spider = spider
         return spider
diff --git a/scrapy/utils/spider.py b/scrapy/utils/spider.py
index 4e43bc13fa7..b81cf2b9bbe 100644
--- a/scrapy/utils/spider.py
+++ b/scrapy/utils/spider.py
@@ -4,6 +4,7 @@
 
 from scrapy import log
 from scrapy.item import BaseItem
+from scrapy.spider import Spider
 from scrapy.utils.misc import arg_to_iter
 
 
@@ -25,21 +26,21 @@ def iter_spider_classes(module):
            getattr(obj, 'name', None):
             yield obj
 
-def create_spider_for_request(spidermanager, request, default_spider=None, \
-    log_none=False, log_multiple=False, **spider_kwargs):
-    """Create a spider to handle the given Request.
+def spidercls_for_request(spidermanager, request, default_spidercls=None,
+                          log_none=False, log_multiple=False):
+    """Return a spider class that handles the given Request.
 
     This will look for the spiders that can handle the given request (using
-    the spider manager) and return a (new) Spider if (and only if) there is
+    the spider manager) and return a Spider class if (and only if) there is
     only one Spider able to handle the Request.
 
     If multiple spiders (or no spider) are found, it will return the
-    default_spider passed. It can optionally log if multiple or no spiders
+    default_spidercls passed. It can optionally log if multiple or no spiders
     are found.
     """
     snames = spidermanager.find_by_request(request)
     if len(snames) == 1:
-        return spidermanager.create(snames[0], **spider_kwargs)
+        return spidermanager.load(snames[0])
 
     if len(snames) > 1 and log_multiple:
         log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
@@ -49,5 +50,8 @@ def create_spider_for_request(spidermanager, request, default_spider=None, \
         log.msg(format='Unable to find spider that handles: %(request)s',
                 level=log.ERROR, request=request)
 
-    return default_spider
+    return default_spidercls
+
+class DefaultSpider(Spider):
+    name = 'default'
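
Note (not part of the patch): a minimal sketch of how the reworked helpers are
meant to be called after this change, mirroring the updated fetch command above.
It assumes a Scrapy project context where get_project_settings() resolves the
settings and spider manager; the URL is a placeholder.

    from scrapy.crawler import CrawlerProcess
    from scrapy.http import Request
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.spider import spidercls_for_request, DefaultSpider

    process = CrawlerProcess(get_project_settings())
    request = Request('http://example.com/')  # placeholder URL

    # spidercls_for_request now returns a spider *class* (or the default class)
    # instead of an instantiated spider, so the caller decides how to crawl it.
    spidercls = spidercls_for_request(process.spiders, request,
                                      default_spidercls=DefaultSpider,
                                      log_multiple=True)

    # As in the updated fetch.py, start_requests is supplied as a spider keyword
    # argument so the crawl fetches exactly this request.
    process.crawl(spidercls, start_requests=lambda: [request])
    process.start()

The design point of the patch is visible here: instantiation moves out of the
utility and into the crawler, so commands only ever pass classes around.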