Commit

add --depth field to parse command
alexcepoi committed May 24, 2012
1 parent 8d77005 commit 8e3c5f1
Showing 1 changed file with 81 additions and 47 deletions.
128 changes: 81 additions & 47 deletions scrapy/commands/parse.py
@@ -11,6 +11,12 @@ class Command(ScrapyCommand):
 
     requires_project = True
 
+    spider = None
+    items = []
+    requests = []
+
+    first_response = None
+
     def syntax(self):
         return "[options] <url>"
 
@@ -31,79 +37,107 @@ def add_options(self, parser):
             help="use CrawlSpider rules to discover the callback")
         parser.add_option("-c", "--callback", dest="callback", \
             help="use this callback for parsing, instead of looking for a callback")
+        parser.add_option("-d", "--depth", dest="depth", type="int", default=1, \
+            help="maximum depth for parsing requests [default: %default]")
+        # parser.add_option("-v", "--verbose", dest="verbose", action="store_true", \
+        #     help="print each depth level one by one")
 
     def pipeline_process(self, item, spider, opts):
         return item
 
-    def run_callback(self, spider, response, callback, opts):
-        cb = callback if callable(callback) else getattr(spider, callback, None)
-        if not cb:
-            log.msg('Cannot find callback %r in spider: %s' % (callback, spider.name))
-            return (), ()
-
+    def run_callback(self, response, cb):
         items, requests = [], []
+
         for x in iterate_spider_output(cb(response)):
             if isinstance(x, BaseItem):
                 items.append(x)
             elif isinstance(x, Request):
                 requests.append(x)
         return items, requests
 
-    def get_callback_from_rules(self, spider, response):
-        if getattr(spider, 'rules', None):
-            for rule in spider.rules:
+    def get_callback_from_rules(self, response):
+        if getattr(self.spider, 'rules', None):
+            for rule in self.spider.rules:
                 if rule.link_extractor.matches(response.url) and rule.callback:
                     return rule.callback
         else:
             log.msg("No CrawlSpider rules found in spider %r, please specify "
-                "a callback to use for parsing" % spider.name, log.ERROR)
+                "a callback to use for parsing" % self.spider.name, log.ERROR)
 
-    def print_results(self, items, requests, cb_name, opts):
+    def print_results(self, opts):
         if not opts.noitems:
-            print "# Scraped Items - callback: %s" % cb_name, "-"*60
-            display.pprint([dict(x) for x in items], colorize=not opts.nocolour)
+            print "# Scraped Items ", "-"*60
+            display.pprint([dict(x) for x in self.items], colorize=not opts.nocolour)
         if not opts.nolinks:
-            print "# Requests - callback: %s" % cb_name, "-"*68
-            display.pprint(requests, colorize=not opts.nocolour)
+            print "# Requests ", "-"*65
+            display.pprint(self.requests, colorize=not opts.nocolour)
 
-    def get_spider(self, request, opts):
+    def set_spider(self, url, opts):
         if opts.spider:
             try:
-                return self.crawler.spiders.create(opts.spider)
+                self.spider = self.crawler.spiders.create(opts.spider)
             except KeyError:
                 log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
         else:
-            spider = create_spider_for_request(self.crawler.spiders, request)
-            if spider:
-                return spider
-            log.msg('Unable to find spider for: %s' % request, log.ERROR)
-
-    def get_response_and_spider(self, url, opts):
-        responses = [] # to collect downloaded responses
-        request = Request(url, callback=responses.append)
-        spider = self.get_spider(request, opts)
-        if not spider:
-            return None, None
-        self.crawler.crawl(spider, [request])
+            # create_spider_for_request() matches a Request against the
+            # project's spiders, so build one from the bare url
+            self.spider = create_spider_for_request(self.crawler.spiders,
+                Request(url))
+            if not self.spider:
+                log.msg('Unable to find spider for: %s' % url, log.ERROR)
+
+    def start_parsing(self, url, opts):
+        request = Request(url, opts.callback)
+        request = self.prepare_request(request, opts)
+
+        self.crawler.crawl(self.spider, [request])
         self.crawler.start()
-        if not responses:
+
+        if not self.first_response:
             log.msg('No response downloaded for: %s' % request, log.ERROR, \
-                spider=spider)
-            return None, None
-        return responses[0], spider
+                spider=self.spider)
+
+    def prepare_request(self, request, opts):
+        def callback(response):
+            # memorize first request
+            if not self.first_response:
+                self.first_response = response
+
+            # determine real callback
+            cb = response.meta['_callback']
+            if not cb:
+                if opts.rules and self.first_response == response:
+                    cb = self.get_callback_from_rules(response)
+                else:
+                    cb = 'parse'
+
+            # resolve a callback name to the spider method before calling it
+            if not callable(cb):
+                cb_method = getattr(self.spider, cb, None)
+                if cb_method is None:
+                    log.msg('Cannot find callback %r in spider: %s' % \
+                        (cb, self.spider.name))
+                    return
+                cb = cb_method
+
+            # parse items and requests
+            items, requests = self.run_callback(response, cb)
+            self.items += items
+
+            depth = response.meta['_depth']
+            if depth < opts.depth:
+                # follow requests one level deeper, re-wrapping their
+                # callbacks so this tracking callback runs again
+                for req in requests:
+                    req.meta['_depth'] = depth + 1
+                    req.meta['_callback'] = req.callback
+                    req.callback = callback
+                return requests
+            else:
+                self.requests += requests
+
+        request.meta['_depth'] = 1
+        request.meta['_callback'] = request.callback
+        request.callback = callback
+        return request

     def run(self, args, opts):
+        # parse arguments
         if not len(args) == 1 or not is_url(args[0]):
             raise UsageError()
-        response, spider = self.get_response_and_spider(args[0], opts)
-        if not response:
-            return
-        callback = None
-        if opts.callback:
-            callback = opts.callback
-        elif opts.rules:
-            callback = self.get_callback_from_rules(spider, response)
-        items, requests = self.run_callback(spider, response, callback or 'parse', \
-            opts)
-        self.print_results(items, requests, callback, opts)
+        else:
+            url = args[0]
+
+        # prepare spider
+        self.set_spider(url, opts)
+
+        if self.spider and opts.depth > 0:
+            self.start_parsing(url, opts)
+            self.print_results(opts)
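For reference, here is how the extended command could be invoked once this change lands. The URL is illustrative, and the -r/--rules spelling of the rules option is assumed from the surrounding file (only its help text appears in this hunk):

    scrapy parse -r --depth=2 http://example.com/product/1

With the default --depth=1 the command parses only the first downloaded response, as before; with --depth=2, requests returned by the first callback are also downloaded and parsed, and print_results() reports the items and requests accumulated across both levels.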
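A minimal sketch of the kind of spider the rules path inspects: get_callback_from_rules() walks spider.rules and uses the callback of the first rule whose link extractor matches the response URL. All names below are hypothetical, written against the contrib import paths current in Scrapy at the time of this commit:

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.item import Item, Field

    class ProductItem(Item):
        name = Field()

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://example.com/']

        # the parse command's rules option resolves callbacks through these
        rules = (
            Rule(SgmlLinkExtractor(allow=r'/product/'), callback='parse_item',
                 follow=True),
        )

        def parse_item(self, response):
            # returning a single item is fine; iterate_spider_output()
            # wraps non-iterable callback results in a list
            item = ProductItem()
            item['name'] = response.url
            return item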
