More core changes:
* removed execution queue (replaced by newer spider queues)
* added real support for returning iterators in Spider.start_requests() (see the sketch after this list)
* removed support for passing urls to 'scrapy crawl' command
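
To illustrate the iterator support, here is a minimal sketch of a spider whose start_requests() is a generator (the spider name and URLs are invented for this example); with this change the engine consumes it lazily instead of materializing every request up front:

    from scrapy.http import Request
    from scrapy.spider import BaseSpider

    class LazySpider(BaseSpider):
        name = 'lazyspider'  # hypothetical spider, not part of this commit

        def start_requests(self):
            # A generator: each Request is created only when the engine asks
            # for the next start request, so the full list never sits in memory.
            for n in xrange(1, 100001):
                yield Request('http://example.com/page/%d' % n)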
pablohoffman committed Jul 15, 2011
1 parent 4dadeb7 commit 84f518f
Showing 14 changed files with 37 additions and 346 deletions.
10 changes: 2 additions & 8 deletions docs/topics/commands.rst
@@ -218,22 +218,16 @@ Usage example::
crawl
-----

* Syntax: ``scrapy crawl <spider|url>``
* Syntax: ``scrapy crawl <spider>``
* Requires project: *yes*

Start crawling a spider. If a URL is passed instead of a spider, it will start
from that URL instead of the spider start urls.
Start crawling a spider.

Usage examples::

$ scrapy crawl example.com
[ ... example.com spider starts crawling ... ]

$ scrapy crawl myspider
[ ... myspider starts crawling ... ]

$ scrapy crawl http://example.com/some/page.html
[ ... spider that handles example.com starts crawling from that url ... ]

.. command:: server

61 changes: 6 additions & 55 deletions scrapy/commands/crawl.py
@@ -1,82 +1,33 @@
from w3lib.url import is_url

from scrapy import log
from scrapy.command import ScrapyCommand
from scrapy.conf import settings
from scrapy.http import Request
from scrapy.utils.conf import arglist_to_dict
from scrapy.exceptions import UsageError

from collections import defaultdict

class Command(ScrapyCommand):

requires_project = True

def syntax(self):
return "[options] <spider|url> ..."
return "[options] <spider>"

def short_desc(self):
return "Start crawling from a spider or URL"

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--spider", dest="spider", default=None, \
help="always use this spider when arguments are urls")
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", \
help="set spider argument (may be repeated)")
parser.add_option("-n", "--nofollow", dest="nofollow", action="store_true", \
help="don't follow links (for use with URLs only)")

def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.nofollow:
settings.overrides['CRAWLSPIDER_FOLLOW_LINKS'] = False

def run(self, args, opts):
q = self.crawler.queue
urls, names = self._split_urls_and_names(args)
for name in names:
q.append_spider_name(name, **opts.spargs)

if opts.spider:
try:
spider = self.crawler.spiders.create(opts.spider, **opts.spargs)
for url in urls:
q.append_url(url, spider)
except KeyError:
log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
else:
for name, urls in self._group_urls_by_spider(urls):
spider = self.crawler.spiders.create(name, **opts.spargs)
for url in urls:
q.append_url(url, spider)
if len(args) < 1:
raise UsageError()
for spname in args:
spider = self.crawler.spiders.create(spname, **opts.spargs)
self.crawler.crawl(spider)
self.crawler.start()

def _group_urls_by_spider(self, urls):
spider_urls = defaultdict(list)
for url in urls:
spider_names = self.crawler.spiders.find_by_request(Request(url))
if not spider_names:
log.msg('Could not find spider that handles url: %s' % url,
log.ERROR)
elif len(spider_names) > 1:
log.msg('More than one spider can handle url: %s - %s' % \
(url, ", ".join(spider_names)), log.ERROR)
else:
spider_urls[spider_names[0]].append(url)
return spider_urls.items()

def _split_urls_and_names(self, args):
urls = []
names = []
for arg in args:
if is_url(arg):
urls.append(arg)
else:
names.append(arg)
return urls, names
14 changes: 6 additions & 8 deletions scrapy/commands/fetch.py
@@ -1,10 +1,10 @@
from w3lib.url import is_url

from scrapy import log
from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.exceptions import UsageError
from scrapy.utils.spider import create_spider_for_request

class Command(ScrapyCommand):

@@ -49,12 +49,10 @@ def run(self, args, opts):

spider = None
if opts.spider:
try:
spider = self.crawler.spiders.create(opts.spider)
except KeyError:
log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

self.crawler.queue.append_request(request, spider, \
default_spider=BaseSpider('default'))
spider = self.crawler.spiders.create(opts.spider)
else:
spider = create_spider_for_request(self.crawler.spiders, request, \
default_spider=BaseSpider('default'))
self.crawler.crawl(spider, [request])
self.crawler.start()
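
Taken together, the fetch command now resolves a spider for the request and hands it to the crawler directly, instead of appending to the old execution queue. A condensed sketch of that flow, using only the calls shown above (the fetch_url wrapper and its standalone shape are assumptions, not part of this commit):

    from scrapy.http import Request
    from scrapy.spider import BaseSpider
    from scrapy.utils.spider import create_spider_for_request

    def fetch_url(crawler, url):
        request = Request(url)
        # Resolve which spider should handle the request, falling back to a
        # bare BaseSpider when no project spider matches.
        spider = create_spider_for_request(crawler.spiders, request,
                                           default_spider=BaseSpider('default'))
        # Hand the single request to the crawler instead of a global queue.
        crawler.crawl(spider, [request])
        crawler.start()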

2 changes: 1 addition & 1 deletion scrapy/commands/parse.py
@@ -84,7 +84,7 @@ def get_response_and_spider(self, url, opts):
spider = self.get_spider(request, opts)
if not spider:
return None, None
self.crawler.queue.append_request(request, spider)
self.crawler.crawl(spider, [request])
self.crawler.start()
if not responses:
log.msg('No response downloaded for: %s' % request, log.ERROR, \
4 changes: 2 additions & 2 deletions scrapy/commands/runspider.py
@@ -60,6 +60,6 @@ def run(self, args, opts):
if not spclasses:
raise UsageError("No spider found in file: %s\n" % filename)
spider = spclasses.pop()(**opts.spargs)
# schedule spider and start engine
self.crawler.queue.append_spider(spider)

self.crawler.crawl(spider)
self.crawler.start()
2 changes: 1 addition & 1 deletion scrapy/commands/shell.py
@@ -11,7 +11,7 @@
class Command(ScrapyCommand):

requires_project = False
default_settings = {'KEEP_ALIVE': True}
default_settings = {'KEEP_ALIVE': True, 'LOGSTATS_INTERVAL': 0}

def syntax(self):
return "[url|file]"
16 changes: 12 additions & 4 deletions scrapy/core/engine.py
@@ -21,9 +21,11 @@

class Slot(object):

def __init__(self):
def __init__(self, start_requests, close_if_idle):
self.closing = False
self.inprogress = set() # requests in progress
self.requests = iter(start_requests)
self.close_if_idle = close_if_idle

def add_request(self, request):
self.inprogress.add(request)
@@ -107,7 +109,13 @@ def next_request(self, spider, now=False):
break

if self.spider_is_idle(spider):
self._spider_idle(spider)
slot = self.slots[spider]
try:
request = slot.requests.next()
self.crawl(request, spider)
except StopIteration:
if slot.close_if_idle:
self._spider_idle(spider)

def _needs_backout(self, spider):
slot = self.slots[spider]
@@ -212,11 +220,11 @@ def _on_complete(_):
return dwld

@defer.inlineCallbacks
def open_spider(self, spider):
def open_spider(self, spider, start_requests=None, close_if_idle=True):
assert self.has_capacity(), "No free spider slots when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
self.slots[spider] = Slot()
self.slots[spider] = Slot(start_requests or (), close_if_idle)
yield self.scheduler.open_spider(spider)
self.downloader.open_spider(spider)
yield self.scraper.open_spider(spider)
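
In short, the engine now draws start requests one at a time whenever the spider would otherwise go idle, and only closes the spider once the iterator is exhausted (and close_if_idle is set). A simplified, self-contained sketch of that logic (the helper name feed_start_request is invented; the real code is next_request() above):

    class Slot(object):
        """Simplified copy of the Slot above: wraps any iterable of requests."""
        def __init__(self, start_requests, close_if_idle):
            self.requests = iter(start_requests)   # lists and generators both work
            self.close_if_idle = close_if_idle
            self.inprogress = set()

    def feed_start_request(slot, schedule, close_spider):
        # Called only when the spider is idle: pull one more start request,
        # or begin closing the spider once the iterator runs dry.
        try:
            request = slot.requests.next()   # Python 2 iterator protocol
        except StopIteration:
            if slot.close_if_idle:
                close_spider()
        else:
            schedule(request)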
37 changes: 5 additions & 32 deletions scrapy/crawler.py
@@ -3,7 +3,6 @@
from twisted.internet import reactor, defer

from scrapy.xlib.pydispatch import dispatcher
from scrapy.queue import ExecutionQueue
from scrapy.core.engine import ExecutionEngine
from scrapy.extension import ExtensionManager
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
@@ -34,51 +33,25 @@ def configure(self):
self.extensions = ExtensionManager.from_settings(self.settings)
spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
self.spiders = spman_cls.from_settings(self.settings)
spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
spq = spq_cls.from_settings(self.settings)
keepalive = self.settings.getbool('KEEP_ALIVE')
pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
keep_alive=keepalive)
self.engine = ExecutionEngine(self.settings, self._spider_closed)

@defer.inlineCallbacks
def _start_next_spider(self):
spider, requests = yield defer.maybeDeferred(self.queue.get_next)
if spider:
self._start_spider(spider, requests)
if self.engine.has_capacity() and not self._nextcall.active():
self._nextcall = reactor.callLater(self.queue.poll_interval, \
self._spider_closed)

@defer.inlineCallbacks
def _start_spider(self, spider, requests):
"""Don't call this method. Use self.queue to start new spiders"""
def crawl(self, spider, requests=None):
spider.set_crawler(self)
yield defer.maybeDeferred(self.engine.open_spider, spider)
for request in requests:
self.engine.crawl(request, spider)
if requests is None:
requests = spider.start_requests()
return self.engine.open_spider(spider, requests)

@defer.inlineCallbacks
def _spider_closed(self, spider=None):
if not self.engine.open_spiders:
is_finished = yield defer.maybeDeferred(self.queue.is_finished)
if is_finished:
self.stop()
return
if self.engine.has_capacity():
self._start_next_spider()
self.stop()

@defer.inlineCallbacks
def start(self):
yield defer.maybeDeferred(self.configure)
yield defer.maybeDeferred(self.engine.start)
self._nextcall = reactor.callLater(0, self._start_next_spider)

@defer.inlineCallbacks
def stop(self):
if self._nextcall.active():
self._nextcall.cancel()
if self.engine.running:
yield defer.maybeDeferred(self.engine.stop)
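
The net effect for callers: crawl() takes a spider plus an optional request list and defaults to the spider's own start_requests(), so commands no longer go through an execution queue. A hedged usage sketch, assumed to run inside a ScrapyCommand where self.crawler is already configured (the spider name and argument are made up):

    # Inside a command's run(), as in crawl.py above:
    spider = self.crawler.spiders.create('myspider', category='books')

    # The requests argument is optional; when omitted, crawl() falls back to
    # spider.start_requests(), which may now be a generator.
    self.crawler.crawl(spider)
    self.crawler.start()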

96 changes: 0 additions & 96 deletions scrapy/queue.py

This file was deleted.

4 changes: 0 additions & 4 deletions scrapy/settings/default_settings.py
@@ -174,8 +174,6 @@
# Item pipelines are typically set in specific commands settings
ITEM_PIPELINES = []

KEEP_ALIVE = False

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
@@ -203,8 +201,6 @@

NEWSPIDER_MODULE = ''

QUEUE_POLL_INTERVAL = 5

RANDOMIZE_DOWNLOAD_DELAY = True

REDIRECT_ENABLED = True
