Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added --overwrite-output (-O) option #716

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions scrapy/commands/crawl.py
Expand Up @@ -20,6 +20,8 @@ def add_options(self, parser):
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-O", "--overwrite-output", metavar="FILE",
help="overwrite scraped items into FILE")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")

Expand All @@ -29,6 +31,11 @@ def process_options(self, args, opts):
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.overwrite_output:
if opts.output:
raise UsageError("Please use only one of --output and --overwrite-output")
opts.output = opts.overwrite_output
self.settings.overrides['FEED_OVERWRITE'] = True
if opts.output:
if opts.output == '-':
self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
Expand Down
7 changes: 7 additions & 0 deletions scrapy/commands/runspider.py
Expand Up @@ -43,6 +43,8 @@ def add_options(self, parser):
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-O", "--overwrite-output", metavar="FILE",
help="overwrite scraped items into FILE")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")

Expand All @@ -52,6 +54,11 @@ def process_options(self, args, opts):
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.overwrite_output:
if opts.output:
raise UsageError("Please use only one of --output and --overwrite-output")
opts.output = opts.overwrite_output
self.settings.overrides['FEED_OVERWRITE'] = True
if opts.output:
if opts.output == '-':
self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
Expand Down
16 changes: 8 additions & 8 deletions scrapy/contrib/feedexport.py
Expand Up @@ -24,7 +24,7 @@
class IFeedStorage(Interface):
"""Interface that all Feed Storages must implement"""

def __init__(uri):
def __init__(uri, settings):
"""Initialize the storage with the parameters given in the URI"""

def open(spider):
Expand Down Expand Up @@ -53,7 +53,7 @@ class StdoutFeedStorage(object):

implements(IFeedStorage)

def __init__(self, uri, _stdout=sys.stdout):
def __init__(self, uri, settings, _stdout=sys.stdout):
self._stdout = _stdout

def open(self, spider):
Expand All @@ -66,22 +66,22 @@ class FileFeedStorage(object):

implements(IFeedStorage)

def __init__(self, uri):
def __init__(self, uri, settings):
self.path = file_uri_to_path(uri)
self.overwrite = settings['FEED_OVERWRITE']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think `settings.getbool('FEED_OVERWRITE', default=False)` fits better here.


def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
return open(self.path, 'ab')
return open(self.path, 'wb' if self.overwrite else 'ab')

def store(self, file):
file.close()

class S3FeedStorage(BlockingFeedStorage):

def __init__(self, uri):
from scrapy.conf import settings
def __init__(self, uri, settings):
try:
import boto
except ImportError:
Expand All @@ -105,7 +105,7 @@ def _store_in_thread(self, file):
class FTPFeedStorage(BlockingFeedStorage):

def __init__(self, uri):
u = urlparse(uri)
u = urlparse(uri, settings)
self.host = u.hostname
self.port = int(u.port or '21')
self.username = u.username
Expand Down Expand Up @@ -222,7 +222,7 @@ def _get_exporter(self, *a, **kw):
return self.exporters[self.format](*a, **kw)

def _get_storage(self, uri):
return self.storages[urlparse(uri).scheme](uri)
return self.storages[urlparse(uri).scheme](uri, self.settings)

def _get_uri_params(self, spider):
params = {}
Expand Down
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py
Expand Up @@ -121,6 +121,7 @@
}

FEED_URI = None
FEED_OVERWRITE = False
FEED_URI_PARAMS = None # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
Expand Down
42 changes: 31 additions & 11 deletions scrapy/tests/test_contrib_feedexport.py
Expand Up @@ -12,38 +12,58 @@

class FileFeedStorageTest(unittest.TestCase):

settings = {"FEED_OVERWRITE": False}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be created inside `setUp`, because if it is modified in one test method, the change will affect the tests that run after it.


def test_store_file_uri(self):
path = os.path.abspath(self.mktemp())
uri = path_to_file_uri(path)
return self._assert_stores(FileFeedStorage(uri), path)
return self._assert_stores(FileFeedStorage(uri, self.settings), path)

def test_store_file_uri_makedirs(self):
path = os.path.abspath(self.mktemp())
path = os.path.join(path, 'more', 'paths', 'file.txt')
uri = path_to_file_uri(path)
return self._assert_stores(FileFeedStorage(uri), path)
return self._assert_stores(FileFeedStorage(uri, self.settings), path)

def test_store_direct_path(self):
path = os.path.abspath(self.mktemp())
return self._assert_stores(FileFeedStorage(path), path)
return self._assert_stores(FileFeedStorage(path, self.settings), path)

def test_store_direct_path_relative(self):
path = self.mktemp()
return self._assert_stores(FileFeedStorage(path), path)
return self._assert_stores(FileFeedStorage(path, self.settings), path)

def test_interface(self):
path = self.mktemp()
st = FileFeedStorage(path)
st = FileFeedStorage(path, self.settings)
verifyObject(IFeedStorage, st)

@defer.inlineCallbacks
def _assert_stores(self, storage, path):
def _store(self, path, settings, content="content"):
storage = FileFeedStorage(path, settings)
spider = Spider("default")
file = storage.open(spider)
file.write("content")
storage.store(file)

def test_append(self):
path = os.path.abspath(self.mktemp())
self._store(path, self.settings)
return self._assert_stores(FileFeedStorage(path, self.settings), path, verify="contentcontent")

def test_overwrite(self):
path = os.path.abspath(self.mktemp())
settings = {"FEED_OVERWRITE": True}
self._store(path, settings)
return self._assert_stores(FileFeedStorage(path, settings), path)

@defer.inlineCallbacks
def _assert_stores(self, storage, path, content="content", verify="content"):
spider = Spider("default")
file = storage.open(spider)
file.write(content)
yield storage.store(file)
self.failUnless(os.path.exists(path))
self.failUnlessEqual(open(path).read(), "content")
self.failUnlessEqual(open(path).read(), verify)


class FTPFeedStorageTest(unittest.TestCase):
Expand All @@ -53,7 +73,7 @@ def test_store(self):
path = os.environ.get('FEEDTEST_FTP_PATH')
if not (uri and path):
raise unittest.SkipTest("No FTP server available for testing")
st = FTPFeedStorage(uri)
st = FTPFeedStorage(uri, settings={})
verifyObject(IFeedStorage, st)
return self._assert_stores(st, path)

Expand All @@ -79,7 +99,7 @@ def test_store(self):
if not uri:
raise unittest.SkipTest("No S3 URI available for testing")
from boto import connect_s3
storage = S3FeedStorage(uri)
storage = S3FeedStorage(uri, settings={})
verifyObject(IFeedStorage, storage)
file = storage.open(Spider("default"))
file.write("content")
Expand All @@ -93,7 +113,7 @@ class StdoutFeedStorageTest(unittest.TestCase):
@defer.inlineCallbacks
def test_store(self):
out = StringIO()
storage = StdoutFeedStorage('stdout:', _stdout=out)
storage = StdoutFeedStorage('stdout:', settings={} ,_stdout=out)
file = storage.open(Spider("default"))
file.write("content")
yield storage.store(file)
Expand Down