Adjust spiders' utils to new SpiderManager API
curita committed Aug 12, 2014
1 parent 900a487 commit 9cbbfd8
Showing 5 changed files with 65 additions and 51 deletions.
14 changes: 6 additions & 8 deletions scrapy/commands/fetch.py
@@ -3,9 +3,8 @@

from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.exceptions import UsageError
from scrapy.utils.spider import create_spider_for_request
from scrapy.utils.spider import spidercls_for_request, DefaultSpider

class Command(ScrapyCommand):

@@ -48,12 +47,11 @@ def run(self, args, opts):
request = Request(args[0], callback=cb, dont_filter=True)
request.meta['handle_httpstatus_all'] = True

crawler = self.crawler_process.create_crawler()
spider = None
spidercls = DefaultSpider
spiders = self.crawler_process.spiders
if opts.spider:
spider = crawler.spiders.create(opts.spider)
spidercls = spiders.load(opts.spider)
else:
spider = create_spider_for_request(crawler.spiders, request, \
default_spider=Spider('default'))
crawler.crawl(spider, [request])
spidercls = spidercls_for_request(spiders, request, spidercls)
self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
self.crawler_process.start()
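
For reference, a minimal usage sketch of the flow the updated fetch command follows above. It is not part of the commit: crawler_process stands for an already configured CrawlerProcess and the URL is a placeholder; the calls themselves (spiders, spidercls_for_request, crawl, start) are the ones appearing in the diff.

# Hedged sketch, not part of the commit: fetch a single URL through the
# class-based SpiderManager API. crawler_process is assumed to be an
# already configured CrawlerProcess; the URL is a placeholder.
from scrapy.http import Request
from scrapy.utils.spider import spidercls_for_request, DefaultSpider

request = Request('http://example.com/some/page', dont_filter=True)
spiders = crawler_process.spiders            # the shared SpiderManager
spidercls = spidercls_for_request(spiders, request, DefaultSpider)

# crawl() now receives the spider class; start_requests is supplied as a
# keyword so the spider yields only this one request.
crawler_process.crawl(spidercls, start_requests=lambda: [request])
crawler_process.start()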
49 changes: 26 additions & 23 deletions scrapy/commands/parse.py
@@ -5,7 +5,7 @@
from scrapy.item import BaseItem
from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError
from scrapy import log

@@ -113,41 +113,45 @@ def run_callback(self, response, cb):
requests.append(x)
return items, requests

def get_callback_from_rules(self, response):
if getattr(self.spider, 'rules', None):
for rule in self.spider.rules:
def get_callback_from_rules(self, spider, response):
if getattr(spider, 'rules', None):
for rule in spider.rules:
if rule.link_extractor.matches(response.url) and rule.callback:
return rule.callback
else:
log.msg(format='No CrawlSpider rules found in spider %(spider)r, '
'please specify a callback to use for parsing',
level=log.ERROR, spider=self.spider.name)
level=log.ERROR, spider=spider.name)

def set_spider(self, url, opts):
def set_spidercls(self, url, opts):
spiders = self.crawler_process.spiders
if opts.spider:
try:
self.spider = self.pcrawler.spiders.create(opts.spider, **opts.spargs)
self.spidercls = spiders.load(opts.spider)
except KeyError:
log.msg(format='Unable to find spider: %(spider)s',
level=log.ERROR, spider=opts.spider)
else:
self.spider = create_spider_for_request(self.pcrawler.spiders, Request(url), **opts.spargs)
if not self.spider:
self.spidercls = spidercls_for_request(spiders, Request(url))
if not self.spidercls:
log.msg(format='Unable to find spider for: %(url)s',
level=log.ERROR, url=url)

def start_parsing(self, url, opts):
request = Request(url, opts.callback)
request = self.prepare_request(request, opts)
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
self.spidercls.start_requests = _start_requests


self.pcrawler.crawl(self.spider, [request])
def start_parsing(self, url, opts):
self.crawler_process.crawl(self.spidercls, **opts.spargs)
self.pcrawler = list(self.crawler_process.crawlers)[0]
self.crawler_process.start()

if not self.first_response:
log.msg(format='No response downloaded for: %(request)s',
level=log.ERROR, request=request)
log.msg(format='No response downloaded for: %(url)s',
level=log.ERROR, url=url)

def prepare_request(self, request, opts):
def prepare_request(self, spider, request, opts):
def callback(response):
# memorize first request
if not self.first_response:
@@ -157,17 +161,17 @@ def callback(response):
cb = response.meta['_callback']
if not cb:
if opts.rules and self.first_response == response:
cb = self.get_callback_from_rules(response)
cb = self.get_callback_from_rules(spider, response)
else:
cb = 'parse'

if not callable(cb):
cb_method = getattr(self.spider, cb, None)
cb_method = getattr(spider, cb, None)
if callable(cb_method):
cb = cb_method
else:
log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s',
callback=callback, spider=self.spider.name, level=log.ERROR)
callback=callback, spider=spider.name, level=log.ERROR)
return

# parse items and requests
@@ -177,7 +181,7 @@ def callback(response):
if opts.pipelines:
itemproc = self.pcrawler.engine.scraper.itemproc
for item in items:
itemproc.process_item(item, self.spider)
itemproc.process_item(item, spider)
self.add_items(depth, items)
self.add_requests(depth, requests)

@@ -207,10 +211,9 @@ def run(self, args, opts):
else:
url = args[0]

# prepare spider
self.pcrawler = self.crawler_process.create_crawler()
self.set_spider(url, opts)
# prepare spidercls
self.set_spidercls(url, opts)

if self.spider and opts.depth > 0:
if self.spidercls and opts.depth > 0:
self.start_parsing(url, opts)
self.print_results(opts)
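
The _start_requests lambda introduced above works by replacing start_requests on the spider class itself, so whichever instance the crawler later creates starts from the single prepared request. A hedged illustration of that mechanism follows; MySpider and the URL are hypothetical, not code from the commit.

# Hedged illustration of the class-level override used by set_spidercls();
# MySpider and the URL are hypothetical, not part of the commit.
from scrapy.http import Request
from scrapy.spider import Spider

class MySpider(Spider):
    name = 'example'

request = Request('http://example.com/page')

# A function assigned to the class becomes a method: s receives the spider
# instance, mirroring lambda s: [self.prepare_request(s, request, opts)].
MySpider.start_requests = lambda s: [request]

spider = MySpider()
print(spider.start_requests())   # [<GET http://example.com/page>]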
26 changes: 19 additions & 7 deletions scrapy/commands/shell.py
@@ -8,6 +8,9 @@

from scrapy.command import ScrapyCommand
from scrapy.shell import Shell
from scrapy.http import Request
from scrapy import log
from scrapy.utils.spider import spidercls_for_request, DefaultSpider


class Command(ScrapyCommand):
@@ -38,18 +41,27 @@ def update_vars(self, vars):
pass

def run(self, args, opts):
crawler = self.crawler_process.create_crawler()

url = args[0] if args else None
spider = crawler.spiders.create(opts.spider) if opts.spider else None

self.crawler_process.start_crawling()
spiders = self.crawler_process.spiders

spidercls = DefaultSpider
if opts.spider:
spidercls = spiders.load(opts.spider)
elif url:
spidercls = spidercls_for_request(spiders, Request(url),
spidercls, log_multiple=True)
crawler = self.crawler_process._create_logged_crawler(spidercls)
crawler.engine = crawler._create_engine()
crawler.engine.start()

self.crawler_process._start_logging()
self._start_crawler_thread()

shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
shell.start(url=url, spider=spider)
shell.start(url=url)

def _start_crawler_thread(self):
t = Thread(target=self.crawler_process.start_reactor)
t = Thread(target=self.crawler_process._start_reactor,
kwargs={'stop_after_crawl': False})
t.daemon = True
t.start()
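
The updated _start_crawler_thread above starts the reactor in a daemon thread and passes stop_after_crawl=False, which keeps the background loop running for the interactive session while letting the process exit when the shell does. A generic, hedged illustration of that threading pattern (blocking_loop is a stand-in, not Scrapy code):

# Hedged, generic illustration of the daemon-thread pattern used above;
# blocking_loop stands in for the long-running reactor and is not Scrapy code.
from threading import Thread
import time

def blocking_loop():
    while True:        # placeholder for a long-running event loop
        time.sleep(1)

t = Thread(target=blocking_loop)
t.daemon = True        # do not keep the process alive once the main thread exits
t.start()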
9 changes: 3 additions & 6 deletions scrapy/shell.py
@@ -21,7 +21,6 @@
from scrapy.utils.console import start_python_console
from scrapy.utils.misc import load_object
from scrapy.utils.response import open_in_browser
from scrapy.utils.spider import create_spider_for_request


class Shell(object):
@@ -67,11 +66,9 @@ def _open_spider(self, request, spider):
return self.spider

if spider is None:
spider = create_spider_for_request(self.crawler.spiders,
request,
Spider('default'),
log_multiple=True)
spider.set_crawler(self.crawler)
spider = self.crawler.spider or self.crawler._create_spider()

self.crawler.spider = spider
self.crawler.engine.open_spider(spider, close_if_idle=False)
self.spider = spider
return spider
18 changes: 11 additions & 7 deletions scrapy/utils/spider.py
@@ -4,6 +4,7 @@

from scrapy import log
from scrapy.item import BaseItem
from scrapy.spider import Spider
from scrapy.utils.misc import arg_to_iter


@@ -25,21 +26,21 @@ def iter_spider_classes(module):
getattr(obj, 'name', None):
yield obj

def create_spider_for_request(spidermanager, request, default_spider=None, \
log_none=False, log_multiple=False, **spider_kwargs):
"""Create a spider to handle the given Request.
def spidercls_for_request(spidermanager, request, default_spidercls=None,
log_none=False, log_multiple=False):
"""Return a spider class that handles the given Request.
This will look for the spiders that can handle the given request (using
the spider manager) and return a (new) Spider if (and only if) there is
the spider manager) and return a Spider class if (and only if) there is
only one Spider able to handle the Request.
If multiple spiders (or no spider) are found, it will return the
default_spider passed. It can optionally log if multiple or no spiders
default_spidercls passed. It can optionally log if multiple or no spiders
are found.
"""
snames = spidermanager.find_by_request(request)
if len(snames) == 1:
return spidermanager.create(snames[0], **spider_kwargs)
return spidermanager.load(snames[0])

if len(snames) > 1 and log_multiple:
log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
@@ -49,5 +50,8 @@
log.msg(format='Unable to find spider that handles: %(request)s',
level=log.ERROR, request=request)

return default_spider
return default_spidercls


class DefaultSpider(Spider):
name = 'default'
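
For completeness, a usage sketch of spidercls_for_request() as documented above. It is not part of the commit: spidermanager stands for whatever SpiderManager instance the caller holds (for example crawler_process.spiders in the commands above) and the URL is a placeholder.

# Hedged usage sketch; spidermanager is assumed to be an existing
# SpiderManager (e.g. crawler_process.spiders) and the URL is a placeholder.
from scrapy.http import Request
from scrapy.utils.spider import spidercls_for_request, DefaultSpider

request = Request('http://www.example.org/products/1')
spidercls = spidercls_for_request(spidermanager, request,
                                  default_spidercls=DefaultSpider,
                                  log_multiple=True, log_none=True)

# Exactly one matching spider: its class is returned.
# Zero or several matches: DefaultSpider is returned and a message is logged
# because of the log_* flags.
# spidercls is a class, not an instance: callers either instantiate it or
# pass it to crawler_process.crawl(spidercls, ...).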
