Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Resolved issue #546. Output format parsing from filename extension. #659

Merged
merged 5 commits into from Apr 14, 2014
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 9 additions & 6 deletions scrapy/commands/crawl.py
Expand Up @@ -2,6 +2,7 @@
from scrapy.utils.conf import arglist_to_dict
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

requires_project = True
Expand All @@ -14,12 +15,12 @@ def short_desc(self):

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", \
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", \
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT", default="jsonlines", \
help="format to use for dumping items with -o (default: %default)")
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")

def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
Expand All @@ -33,6 +34,8 @@ def process_options(self, args, opts):
else:
self.settings.overrides['FEED_URI'] = opts.output
valid_output_formats = self.settings['FEED_EXPORTERS'].keys() + self.settings['FEED_EXPORTERS_BASE'].keys()
if not opts.output_format:
opts.output_format = os.path.splitext(opts.output)[1].replace(".", "")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will raise an unhelpful exception when the user runs scrapy crawl myspider -o ./data.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I didn't notice that os was not imported.
After import:
Runspider

~/dev/spider/pws/spiders ❯ ~/dev/scrapy/bin/scrapy runspider pws_spider.py -o ./data.                                                 
Usage
=====
  scrapy runspider [options] <spider_file>

runspider: error: Invalid/unrecognized output format: , Expected ['xml', 'jsonlines', 'json', 'csv', 'pickle', 'marshal']

Crawl:

~/dev/spider/pws/spiders ❯ ~/dev/scrapy/bin/scrapy crawl pws_spider.py -o ./data.                                                    
Usage
=====
  scrapy crawl [options] <spider>

crawl: error: Invalid/unrecognized output format: , Expected ['xml', 'jsonlines', 'json', 'csv', 'pickle', 'marshal']

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about

scrapy crawl myspider -o ./data

(without trailing dot)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function os.path.splitext() returns a tuple of the file name and the file extension. We then take the file extension and remove the dot. Of course, it may be necessary to replace only the first dot, but I have not seen file types with a double extension.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you're right.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without trailing dot:

~/dev/spider/pws/spiders ❯ ~/dev/scrapy/bin/scrapy crawl pws_spider.py -o ./data                                                      
Usage
=====
  scrapy crawl [options] <spider>

crawl: error: Invalid/unrecognized output format: , Expected ['xml', 'jsonlines', 'json', 'csv', 'pickle', 'marshal']

The result of splitext for ./data will be an empty file extension.

>>> import os
>>> fname = "./data"
>>> print os.path.splitext(fname)
('./data', '')

if opts.output_format not in valid_output_formats:
raise UsageError('Invalid/unrecognized output format: %s, Expected %s' % (opts.output_format, valid_output_formats))
self.settings.overrides['FEED_FORMAT'] = opts.output_format
Expand Down
16 changes: 9 additions & 7 deletions scrapy/commands/runspider.py
Expand Up @@ -37,12 +37,12 @@ def long_desc(self):

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", \
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", \
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT", default="jsonlines", \
help="format to use for dumping items with -o (default: %default)")
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")

def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
Expand All @@ -56,8 +56,10 @@ def process_options(self, args, opts):
else:
self.settings.overrides['FEED_URI'] = opts.output
valid_output_formats = self.settings['FEED_EXPORTERS'].keys() + self.settings['FEED_EXPORTERS_BASE'].keys()
if not opts.output_format:
opts.output_format = os.path.splitext(opts.output)[1].replace(".", "")
if opts.output_format not in valid_output_formats:
raise UsageError('Invalid/unrecognized output format: %s, Expected %s' % (opts.output_format,valid_output_formats))
raise UsageError('Invalid/unrecognized output format: %s, Expected %s' % (opts.output_format, valid_output_formats))
self.settings.overrides['FEED_FORMAT'] = opts.output_format

def run(self, args, opts):
Expand Down