New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG+1] add FEED_STORAGE_S3_ACL setting #3607
Changes from 15 commits
0135680
ad83ffd
126207f
e0f34be
7b83ed7
e25b9a2
dbeb088
079af88
ceae356
cfd183a
f824f5b
1eac2a1
7c9f0bd
c2dede2
984e706
b4d132b
dc0b643
ea8be62
9b8ba4c
9fed6fc
fda1d04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,7 +93,7 @@ def store(self, file): | |
|
||
class S3FeedStorage(BlockingFeedStorage): | ||
|
||
def __init__(self, uri, access_key=None, secret_key=None): | ||
def __init__(self, uri, access_key=None, secret_key=None, acl=None): | ||
# BEGIN Backwards compatibility for initialising without keys (and | ||
# without using from_crawler) | ||
no_defaults = access_key is None and secret_key is None | ||
|
@@ -118,6 +118,7 @@ def __init__(self, uri, access_key=None, secret_key=None): | |
self.secret_key = u.password or secret_key | ||
self.is_botocore = is_botocore() | ||
self.keyname = u.path[1:] # remove first "/" | ||
self.acl = acl | ||
if self.is_botocore: | ||
import botocore.session | ||
session = botocore.session.get_session() | ||
|
@@ -130,19 +131,26 @@ def __init__(self, uri, access_key=None, secret_key=None): | |
|
||
@classmethod | ||
def from_crawler(cls, crawler, uri): | ||
return cls(uri, crawler.settings['AWS_ACCESS_KEY_ID'], | ||
crawler.settings['AWS_SECRET_ACCESS_KEY']) | ||
return cls( | ||
uri=uri, | ||
access_key=crawler.settings['AWS_ACCESS_KEY_ID'], | ||
secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'], | ||
acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None | ||
) | ||
|
||
def _store_in_thread(self, file): | ||
file.seek(0) | ||
if self.is_botocore: | ||
kwargs = {'ACL': self.acl} if self.acl else {} | ||
self.s3_client.put_object( | ||
Bucket=self.bucketname, Key=self.keyname, Body=file) | ||
Bucket=self.bucketname, Key=self.keyname, Body=file, | ||
**kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reasons to use [inline code lost in extraction; presumably `**kwargs`]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm just trying to avoid changing the previous behavior. I'm not sure about the side effects of explicitly passing [inline code lost in extraction; presumably `ACL=None`]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok. Got it. |
||
else: | ||
conn = self.connect_s3(self.access_key, self.secret_key) | ||
bucket = conn.get_bucket(self.bucketname, validate=False) | ||
key = bucket.new_key(self.keyname) | ||
key.set_contents_from_file(file) | ||
kwargs = {'policy': self.acl} if self.acl else {} | ||
key.set_contents_from_file(file, **kwargs) | ||
key.close() | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
from tests.mockserver import MockServer | ||
from w3lib.url import path_to_file_uri | ||
|
||
import botocore.client | ||
import scrapy | ||
from scrapy.exporters import CsvItemExporter | ||
from scrapy.extensions.feedexport import ( | ||
|
@@ -186,6 +187,144 @@ def test_store(self): | |
content = get_s3_content_and_delete(u.hostname, u.path[1:]) | ||
self.assertEqual(content, expected_content) | ||
|
||
def test_init_without_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, None) | ||
|
||
def test_init_with_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key', | ||
'custom-acl' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, 'custom-acl') | ||
|
||
def test_from_crawler_without_acl(self): | ||
settings = { | ||
'AWS_ACCESS_KEY_ID': 'access_key', | ||
'AWS_SECRET_ACCESS_KEY': 'secret_key', | ||
} | ||
crawler = get_crawler(settings_dict=settings) | ||
storage = S3FeedStorage.from_crawler( | ||
crawler, | ||
's3://mybucket/export.csv' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, None) | ||
|
||
def test_from_crawler_with_acl(self): | ||
settings = { | ||
'AWS_ACCESS_KEY_ID': 'access_key', | ||
'AWS_SECRET_ACCESS_KEY': 'secret_key', | ||
'FEED_STORAGE_S3_ACL': 'custom-acl', | ||
} | ||
crawler = get_crawler(settings_dict=settings) | ||
storage = S3FeedStorage.from_crawler( | ||
crawler, | ||
's3://mybucket/export.csv' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, 'custom-acl') | ||
|
||
@defer.inlineCallbacks | ||
def test_store_botocore_without_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key', | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, None) | ||
|
||
with mock.patch('botocore.client.BaseClient._make_api_call') as m: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The heavy use of mocks worries me, especially as we're mocking a private method; tbh I'd prefer not to use mocks at all, and write tests in some other way (including having fewer of them). On the other hand, checking for exact operation names like [inline code lost in extraction; presumably `PutObject`] [remainder of this comment lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that you pointed it out, it's also bothering me. I have just refactored the tests not to use private methods. Thanks. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just writing a thought here. Regarding mocks, no problems, just think we should be careful when using them. 😄 |
||
yield storage.store(BytesIO(b'test file')) | ||
|
||
operation_name, api_params = m.call_args[0] | ||
self.assertEqual(operation_name, 'PutObject') | ||
self.assertNotIn('ACL', api_params) | ||
|
||
@defer.inlineCallbacks | ||
def test_store_botocore_with_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key', | ||
'custom-acl' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, 'custom-acl') | ||
|
||
with mock.patch('botocore.client.BaseClient._make_api_call') as m: | ||
yield storage.store(BytesIO(b'test file')) | ||
|
||
operation_name, api_params = m.call_args[0] | ||
self.assertEqual(operation_name, 'PutObject') | ||
self.assertEqual(api_params.get('ACL'), 'custom-acl') | ||
|
||
@defer.inlineCallbacks | ||
def test_store_not_botocore_without_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key', | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, None) | ||
|
||
storage.is_botocore = False | ||
storage.connect_s3 = mock.MagicMock() | ||
self.assertFalse(storage.is_botocore) | ||
|
||
yield storage.store(BytesIO(b'test file')) | ||
|
||
conn = storage.connect_s3(*storage.connect_s3.call_args) | ||
bucket = conn.get_bucket(*conn.get_bucket.call_args) | ||
key = bucket.new_key(*bucket.new_key.call_args) | ||
self.assertNotIn( | ||
dict(policy='custom-acl'), | ||
key.set_contents_from_file.call_args | ||
) | ||
|
||
@defer.inlineCallbacks | ||
def test_store_not_botocore_with_acl(self): | ||
storage = S3FeedStorage( | ||
's3://mybucket/export.csv', | ||
'access_key', | ||
'secret_key', | ||
'custom-acl' | ||
) | ||
self.assertEqual(storage.access_key, 'access_key') | ||
self.assertEqual(storage.secret_key, 'secret_key') | ||
self.assertEqual(storage.acl, 'custom-acl') | ||
|
||
storage.is_botocore = False | ||
storage.connect_s3 = mock.MagicMock() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe [remainder of this comment lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know. I decided to go with this simple method because I just would like to guarantee we're calling the [inline code lost in extraction; presumably `set_contents_from_file` with the expected arguments]. If we change the method names, the code would break because the mocks wouldn't have been called from the test. Let me know if I am missing anything. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is fine for me, the main concern was if the API (method names) change, the unit test won't catch these errors and we'll notice them in production or so. But, if we [remainder of this comment lost in extraction] |
||
self.assertFalse(storage.is_botocore) | ||
|
||
yield storage.store(BytesIO(b'test file')) | ||
|
||
conn = storage.connect_s3(*storage.connect_s3.call_args) | ||
bucket = conn.get_bucket(*conn.get_bucket.call_args) | ||
key = bucket.new_key(*bucket.new_key.call_args) | ||
self.assertIn( | ||
dict(policy='custom-acl'), | ||
key.set_contents_from_file.call_args | ||
) | ||
|
||
|
||
class StdoutFeedStorageTest(unittest.TestCase): | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ deps = | |
lxml==3.4.0 | ||
Twisted==14.0.2 | ||
boto==2.34.0 | ||
botocore==1.12.89 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should be dropping jessie support, but if we're keeping it, botocore should be 0.62 (see https://packages.debian.org/en/jessie/python-botocore) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated tox.ini. Waiting for tests to pass. Thanks. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The botocore version supported by Debian jessie does not contain the required methods... I'll just skip this test when in jessie environments. |
||
Pillow==2.6.1 | ||
cssselect==0.9.1 | ||
zope.interface==4.1.1 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should consider changing the name to `FEED_ACL`, and follow an approach similar to other feed storage settings here such as `FEED_URI`: the backend-specific documentation is provided in the backend-specific documentation section.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see this approach is already followed in other places. I guess any change to that is then a different beast to treat in a different changeset. Let's merge.