From 013568097db04396d780d1c91d37027115af7fe2 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 29 Jan 2019 11:10:06 -0300 Subject: [PATCH 01/20] add FEED_STORAGE_S3_ACL setting --- scrapy/extensions/feedexport.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 22ebf3b3f2a..eb080226154 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -118,6 +118,7 @@ def __init__(self, uri, access_key=None, secret_key=None): self.secret_key = u.password or secret_key self.is_botocore = is_botocore() self.keyname = u.path[1:] # remove first "/" + self.policy = settings.get('FEED_STORAGE_S3_ACL', 'private') if self.is_botocore: import botocore.session session = botocore.session.get_session() @@ -137,12 +138,13 @@ def _store_in_thread(self, file): file.seek(0) if self.is_botocore: self.s3_client.put_object( - Bucket=self.bucketname, Key=self.keyname, Body=file) + Bucket=self.bucketname, Key=self.keyname, Body=file, + ACL=self.policy) else: conn = self.connect_s3(self.access_key, self.secret_key) bucket = conn.get_bucket(self.bucketname, validate=False) key = bucket.new_key(self.keyname) - key.set_contents_from_file(file) + key.set_contents_from_file(file, policy=self.policy) key.close() From ad83ffdf1f4d69ffb62b243429e7b59d0930524c Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 6 Feb 2019 18:32:46 -0200 Subject: [PATCH 02/20] refactoring --- scrapy/extensions/feedexport.py | 19 ++++++-- tests/test_feedexport.py | 84 +++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 5 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index eb080226154..ca30322be08 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -93,7 +93,7 @@ def store(self, file): class S3FeedStorage(BlockingFeedStorage): - def __init__(self, uri, access_key=None, secret_key=None): + def __init__(self, uri, access_key=None, secret_key=None, acl=None): # BEGIN Backwards compatibility for initialising without keys (and # without using from_crawler) no_defaults = access_key is None and secret_key is None @@ -118,7 +118,7 @@ def __init__(self, uri, access_key=None, secret_key=None): self.secret_key = u.password or secret_key self.is_botocore = is_botocore() self.keyname = u.path[1:] # remove first "/" - self.policy = settings.get('FEED_STORAGE_S3_ACL', 'private') + self.acl = acl if self.is_botocore: import botocore.session session = botocore.session.get_session() @@ -132,19 +132,28 @@ def __init__(self, uri, access_key=None, secret_key=None): @classmethod def from_crawler(cls, crawler, uri): return cls(uri, crawler.settings['AWS_ACCESS_KEY_ID'], - crawler.settings['AWS_SECRET_ACCESS_KEY']) + crawler.settings['AWS_SECRET_ACCESS_KEY'], + crawler.settings.get('FEED_STORAGE_S3_ACL')) def _store_in_thread(self, file): file.seek(0) if self.is_botocore: + kwargs = dict() + if self.acl: + kwargs.update(dict(ACL=self.acl)) + self.s3_client.put_object( Bucket=self.bucketname, Key=self.keyname, Body=file, - ACL=self.policy) + **kwargs) else: conn = self.connect_s3(self.access_key, self.secret_key) bucket = conn.get_bucket(self.bucketname, validate=False) key = bucket.new_key(self.keyname) - key.set_contents_from_file(file, policy=self.policy) + kwargs = dict() + if self.acl: + kwargs.update(dict(policy=self.acl)) + + key.set_contents_from_file(file, **kwargs) key.close() diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index e46c8c14eb9..b07635cb04c 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -18,6 +18,7 @@ from tests.mockserver import MockServer from w3lib.url import path_to_file_uri +import botocore.client import scrapy from scrapy.exporters import CsvItemExporter from scrapy.extensions.feedexport import ( @@ -186,6 +187,89 @@ def test_store(self): content = get_s3_content_and_delete(u.hostname, u.path[1:]) self.assertEqual(content, expected_content) + def test_init_without_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, None) + + def test_init_with_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key', + 'custom-acl' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, 'custom-acl') + + def test_from_crawler_without_acl(self): + settings = { + 'AWS_ACCESS_KEY_ID': 'access_key', + 'AWS_SECRET_ACCESS_KEY': 'secret_key', + } + crawler = get_crawler(settings_dict=settings) + storage = S3FeedStorage.from_crawler( + crawler, + 's3://mybucket/export.csv' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, None) + + def test_from_crawler_with_acl(self): + settings = { + 'AWS_ACCESS_KEY_ID': 'access_key', + 'AWS_SECRET_ACCESS_KEY': 'secret_key', + 'FEED_STORAGE_S3_ACL': 'custom-acl', + } + crawler = get_crawler(settings_dict=settings) + storage = S3FeedStorage.from_crawler( + crawler, + 's3://mybucket/export.csv' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, 'custom-acl') + + def test_store_in_thread_without_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key', + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, None) + + with mock.patch('botocore.client.BaseClient._make_api_call') as _make_api_call_mock: + storage._store_in_thread(BytesIO(b'test file')) + operation_name, api_params = _make_api_call_mock.call_args[0] + self.assertEqual(operation_name, 'PutObject') + self.assertNotIn('ACL', api_params) + + def test_store_in_thread_with_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key', + 'custom-acl' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, 'custom-acl') + + with mock.patch('botocore.client.BaseClient._make_api_call') as _make_api_call_mock: + storage._store_in_thread(BytesIO(b'test file')) + operation_name, api_params = _make_api_call_mock.call_args[0] + self.assertEqual(operation_name, 'PutObject') + self.assertEqual(api_params.get('ACL'), 'custom-acl') + class StdoutFeedStorageTest(unittest.TestCase): From 126207fb7bca21d3d95ed9c66028e82771180370 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 6 Feb 2019 18:38:17 -0200 Subject: [PATCH 03/20] PEP8: use short name for mock method --- tests/test_feedexport.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index b07635cb04c..bfac06efc55 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -247,9 +247,9 @@ def test_store_in_thread_without_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, None) - with mock.patch('botocore.client.BaseClient._make_api_call') as _make_api_call_mock: + with mock.patch('botocore.client.BaseClient._make_api_call') as m: storage._store_in_thread(BytesIO(b'test file')) - operation_name, api_params = _make_api_call_mock.call_args[0] + operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertNotIn('ACL', api_params) @@ -264,9 +264,9 @@ def test_store_in_thread_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') - with mock.patch('botocore.client.BaseClient._make_api_call') as _make_api_call_mock: + with mock.patch('botocore.client.BaseClient._make_api_call') as m: storage._store_in_thread(BytesIO(b'test file')) - operation_name, api_params = _make_api_call_mock.call_args[0] + operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertEqual(api_params.get('ACL'), 'custom-acl') From e0f34be383e361c75b22da59c97dee1db189937e Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 6 Feb 2019 18:50:19 -0200 Subject: [PATCH 04/20] update docs --- docs/topics/feed-exports.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index b64dbfbfdd9..661751ed9ac 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -185,6 +185,10 @@ passed through the following settings: * :setting:`AWS_ACCESS_KEY_ID` * :setting:`AWS_SECRET_ACCESS_KEY` +You can also define a custom ACL for exported objects using this setting: + + * :setting:`FEED_STORAGE_S3_ACL` + .. _topics-feed-storage-stdout: Standard output @@ -205,6 +209,7 @@ These are the settings used for configuring the feed exports: * :setting:`FEED_URI` (mandatory) * :setting:`FEED_FORMAT` * :setting:`FEED_STORAGES` + * :setting:`FEED_STORAGE_S3_ACL` * :setting:`FEED_EXPORTERS` * :setting:`FEED_STORE_EMPTY` * :setting:`FEED_EXPORT_ENCODING` @@ -302,11 +307,22 @@ Default: ``{}`` A dict containing additional feed storage backends supported by your project. The keys are URI schemes and the values are paths to storage classes. +.. setting:: FEED_STORAGE_S3_ACL + +FEED_STORAGE_S3_ACL +------------------- + +Default: ``None`` + +A string containing a custom ACL for feeds exported to Amazon S3 by your project. + +For a complete list of available values, access the `Canned ACL`_ section on Amazon S3 docs. + .. setting:: FEED_STORAGES_BASE FEED_STORAGES_BASE ------------------ - +` Default:: { @@ -366,3 +382,4 @@ format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter .. _Amazon S3: https://aws.amazon.com/s3/ .. _boto: https://github.com/boto/boto .. _botocore: https://github.com/boto/botocore +.. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl From 7b83ed7c5e1fcd81baf50db3a76f10ade7aa226e Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 6 Feb 2019 18:52:24 -0200 Subject: [PATCH 05/20] remove typo --- docs/topics/feed-exports.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 661751ed9ac..25979dfef80 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -322,7 +322,7 @@ For a complete list of available values, access the `Canned ACL`_ section on Ama FEED_STORAGES_BASE ------------------ -` + Default:: { From e25b9a2323c169a4032ff07f912299b32de4b2e0 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 6 Feb 2019 18:52:39 -0200 Subject: [PATCH 06/20] calling it feeds instead of objects --- docs/topics/feed-exports.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index 25979dfef80..dee0c3ffa7a 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -185,7 +185,7 @@ passed through the following settings: * :setting:`AWS_ACCESS_KEY_ID` * :setting:`AWS_SECRET_ACCESS_KEY` -You can also define a custom ACL for exported objects using this setting: +You can also define a custom ACL for exported feeds using this setting: * :setting:`FEED_STORAGE_S3_ACL` From dbeb088eea1713ac43f3d23579c36ece5f67563f Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 7 Feb 2019 09:29:16 -0200 Subject: [PATCH 07/20] trying to fix jessie testenv by adding botocore to requirements and fixing its version --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 0c0f8f7b7d5..f2f3e1293d0 100644 --- a/tox.ini +++ b/tox.ini @@ -47,6 +47,7 @@ deps = lxml==3.4.0 Twisted==14.0.2 boto==2.34.0 + botocore==1.12.89 Pillow==2.6.1 cssselect==0.9.1 zope.interface==4.1.1 From 079af889e7d010a79640e6874c3e6dc394b936ae Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 7 Feb 2019 10:42:59 -0200 Subject: [PATCH 08/20] also testing without botocore --- tests/test_feedexport.py | 53 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index bfac06efc55..520ca4a8f2b 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -237,7 +237,7 @@ def test_from_crawler_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') - def test_store_in_thread_without_acl(self): + def test_store_in_thread_botocore_without_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -253,7 +253,7 @@ def test_store_in_thread_without_acl(self): self.assertEqual(operation_name, 'PutObject') self.assertNotIn('ACL', api_params) - def test_store_in_thread_with_acl(self): + def test_store_in_thread_botocore_with_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -270,6 +270,55 @@ def test_store_in_thread_with_acl(self): self.assertEqual(operation_name, 'PutObject') self.assertEqual(api_params.get('ACL'), 'custom-acl') + def test_store_in_thread_not_botocore_without_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key', + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, None) + + storage.is_botocore = False + storage.connect_s3 = mock.MagicMock() + self.assertFalse(storage.is_botocore) + + storage._store_in_thread(BytesIO(b'test file')) + + conn = storage.connect_s3(*storage.connect_s3.call_args) + bucket = conn.get_bucket(*conn.get_bucket.call_args) + key = bucket.new_key(*bucket.new_key.call_args) + self.assertNotIn( + dict(policy='custom-acl'), + key.set_contents_from_file.call_args + ) + + def test_store_in_thread_not_botocore_with_acl(self): + storage = S3FeedStorage( + 's3://mybucket/export.csv', + 'access_key', + 'secret_key', + 'custom-acl' + ) + self.assertEqual(storage.access_key, 'access_key') + self.assertEqual(storage.secret_key, 'secret_key') + self.assertEqual(storage.acl, 'custom-acl') + + storage.is_botocore = False + storage.connect_s3 = mock.MagicMock() + self.assertFalse(storage.is_botocore) + + storage._store_in_thread(BytesIO(b'test file')) + + conn = storage.connect_s3(*storage.connect_s3.call_args) + bucket = conn.get_bucket(*conn.get_bucket.call_args) + key = bucket.new_key(*bucket.new_key.call_args) + self.assertIn( + dict(policy='custom-acl'), + key.set_contents_from_file.call_args + ) + class StdoutFeedStorageTest(unittest.TestCase): From ceae356e62dc2e56465a58b3d9fe00813e289dd6 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 8 Feb 2019 11:47:35 -0200 Subject: [PATCH 09/20] add FEED_STORAGE_S3_ACL to default_settings.py file --- scrapy/settings/default_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 3734a0a5868..776c5af23aa 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -158,6 +158,8 @@ } FEED_EXPORT_INDENT = 0 +FEED_STORAGE_S3_ACL = None + FILES_STORE_S3_ACL = 'private' FILES_STORE_GCS_ACL = '' From cfd183a9d19563f09487af942c9a635d665a1905 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 8 Feb 2019 14:49:26 -0200 Subject: [PATCH 10/20] no need to use get here since we're defining a default value in default_settings.py --- scrapy/extensions/feedexport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index ca30322be08..2b4594ad811 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -133,7 +133,7 @@ def __init__(self, uri, access_key=None, secret_key=None, acl=None): def from_crawler(cls, crawler, uri): return cls(uri, crawler.settings['AWS_ACCESS_KEY_ID'], crawler.settings['AWS_SECRET_ACCESS_KEY'], - crawler.settings.get('FEED_STORAGE_S3_ACL')) + crawler.settings['FEED_STORAGE_S3_ACL']) def _store_in_thread(self, file): file.seek(0) From f824f5b2d17b082dac04505ba27afdfa869a11c7 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 8 Feb 2019 15:19:57 -0200 Subject: [PATCH 11/20] testing public method store instead of private method _store_in_thread need to mock deferToThread function --- tests/test_feedexport.py | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 520ca4a8f2b..e8c32ea43a9 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -237,7 +237,7 @@ def test_from_crawler_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') - def test_store_in_thread_botocore_without_acl(self): + def test_store_botocore_without_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -247,13 +247,19 @@ def test_store_in_thread_botocore_without_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, None) + def _defer(f, *args, **kwargs): + return f(*args, **kwargs) + with mock.patch('botocore.client.BaseClient._make_api_call') as m: - storage._store_in_thread(BytesIO(b'test file')) + with mock.patch('twisted.internet.threads.deferToThread', + new=_defer): + storage.store(BytesIO(b'test file')) + operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertNotIn('ACL', api_params) - def test_store_in_thread_botocore_with_acl(self): + def test_store_botocore_with_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -264,13 +270,19 @@ def test_store_in_thread_botocore_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') + def _defer(f, *args, **kwargs): + return f(*args, **kwargs) + with mock.patch('botocore.client.BaseClient._make_api_call') as m: - storage._store_in_thread(BytesIO(b'test file')) + with mock.patch('twisted.internet.threads.deferToThread', + new=_defer): + storage.store(BytesIO(b'test file')) + operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertEqual(api_params.get('ACL'), 'custom-acl') - def test_store_in_thread_not_botocore_without_acl(self): + def test_store_not_botocore_without_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -284,7 +296,11 @@ def test_store_in_thread_not_botocore_without_acl(self): storage.connect_s3 = mock.MagicMock() self.assertFalse(storage.is_botocore) - storage._store_in_thread(BytesIO(b'test file')) + def _defer(f, *args, **kwargs): + return f(*args, **kwargs) + + with mock.patch('twisted.internet.threads.deferToThread', new=_defer): + storage.store(BytesIO(b'test file')) conn = storage.connect_s3(*storage.connect_s3.call_args) bucket = conn.get_bucket(*conn.get_bucket.call_args) @@ -294,7 +310,7 @@ def test_store_in_thread_not_botocore_without_acl(self): key.set_contents_from_file.call_args ) - def test_store_in_thread_not_botocore_with_acl(self): + def test_store_not_botocore_with_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -309,7 +325,11 @@ def test_store_in_thread_not_botocore_with_acl(self): storage.connect_s3 = mock.MagicMock() self.assertFalse(storage.is_botocore) - storage._store_in_thread(BytesIO(b'test file')) + def _defer(f, *args, **kwargs): + return f(*args, **kwargs) + + with mock.patch('twisted.internet.threads.deferToThread', new=_defer): + storage.store(BytesIO(b'test file')) conn = storage.connect_s3(*storage.connect_s3.call_args) bucket = conn.get_bucket(*conn.get_bucket.call_args) From 1eac2a163c2c734594d4f1e7e026eab309b2b0b5 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 8 Feb 2019 16:50:39 -0200 Subject: [PATCH 12/20] simplifying how we deal with threads.deferToThread calls --- tests/test_feedexport.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index e8c32ea43a9..0f31ef00ec2 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -237,6 +237,7 @@ def test_from_crawler_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') + @defer.inlineCallbacks def test_store_botocore_without_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', @@ -247,18 +248,14 @@ def test_store_botocore_without_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, None) - def _defer(f, *args, **kwargs): - return f(*args, **kwargs) - with mock.patch('botocore.client.BaseClient._make_api_call') as m: - with mock.patch('twisted.internet.threads.deferToThread', - new=_defer): - storage.store(BytesIO(b'test file')) + yield storage.store(BytesIO(b'test file')) operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertNotIn('ACL', api_params) + @defer.inlineCallbacks def test_store_botocore_with_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', @@ -270,18 +267,14 @@ def test_store_botocore_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') - def _defer(f, *args, **kwargs): - return f(*args, **kwargs) - with mock.patch('botocore.client.BaseClient._make_api_call') as m: - with mock.patch('twisted.internet.threads.deferToThread', - new=_defer): - storage.store(BytesIO(b'test file')) + yield storage.store(BytesIO(b'test file')) operation_name, api_params = m.call_args[0] self.assertEqual(operation_name, 'PutObject') self.assertEqual(api_params.get('ACL'), 'custom-acl') + @defer.inlineCallbacks def test_store_not_botocore_without_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', @@ -296,11 +289,7 @@ def test_store_not_botocore_without_acl(self): storage.connect_s3 = mock.MagicMock() self.assertFalse(storage.is_botocore) - def _defer(f, *args, **kwargs): - return f(*args, **kwargs) - - with mock.patch('twisted.internet.threads.deferToThread', new=_defer): - storage.store(BytesIO(b'test file')) + yield storage.store(BytesIO(b'test file')) conn = storage.connect_s3(*storage.connect_s3.call_args) bucket = conn.get_bucket(*conn.get_bucket.call_args) @@ -310,6 +299,7 @@ def _defer(f, *args, **kwargs): key.set_contents_from_file.call_args ) + @defer.inlineCallbacks def test_store_not_botocore_with_acl(self): storage = S3FeedStorage( 's3://mybucket/export.csv', @@ -325,11 +315,7 @@ def test_store_not_botocore_with_acl(self): storage.connect_s3 = mock.MagicMock() self.assertFalse(storage.is_botocore) - def _defer(f, *args, **kwargs): - return f(*args, **kwargs) - - with mock.patch('twisted.internet.threads.deferToThread', new=_defer): - storage.store(BytesIO(b'test file')) + yield storage.store(BytesIO(b'test file')) conn = storage.connect_s3(*storage.connect_s3.call_args) bucket = conn.get_bucket(*conn.get_bucket.call_args) From 7c9f0bd86c5f02ea803fa6bf1242d34d9c9f47d5 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 12 Feb 2019 12:19:30 -0200 Subject: [PATCH 13/20] using named params with optional amazon s3 params --- scrapy/extensions/feedexport.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 2b4594ad811..f6bc460ea32 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -131,9 +131,12 @@ def __init__(self, uri, access_key=None, secret_key=None, acl=None): @classmethod def from_crawler(cls, crawler, uri): - return cls(uri, crawler.settings['AWS_ACCESS_KEY_ID'], - crawler.settings['AWS_SECRET_ACCESS_KEY'], - crawler.settings['FEED_STORAGE_S3_ACL']) + return cls( + uri=uri, + access_key=crawler.settings['AWS_ACCESS_KEY_ID'], + secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'], + acl=crawler.settings['FEED_STORAGE_S3_ACL'] + ) def _store_in_thread(self, file): file.seek(0) From c2dede27bd56bd783c45fb7302ca06b7c2c025c0 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 12 Feb 2019 12:22:05 -0200 Subject: [PATCH 14/20] reduce code with simple ternary operator --- scrapy/extensions/feedexport.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index f6bc460ea32..40f985f19a6 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -141,10 +141,7 @@ def from_crawler(cls, crawler, uri): def _store_in_thread(self, file): file.seek(0) if self.is_botocore: - kwargs = dict() - if self.acl: - kwargs.update(dict(ACL=self.acl)) - + kwargs = {'ACL': self.acl} if self.acl else {} self.s3_client.put_object( Bucket=self.bucketname, Key=self.keyname, Body=file, **kwargs) @@ -152,10 +149,7 @@ def _store_in_thread(self, file): conn = self.connect_s3(self.access_key, self.secret_key) bucket = conn.get_bucket(self.bucketname, validate=False) key = bucket.new_key(self.keyname) - kwargs = dict() - if self.acl: - kwargs.update(dict(policy=self.acl)) - + kwargs = {'policy': self.acl} if self.acl else {} key.set_contents_from_file(file, **kwargs) key.close() From 984e706fd2e06457bcdd1226366d08950ac101b0 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 12 Feb 2019 12:26:57 -0200 Subject: [PATCH 15/20] using blank string instead of None as default value as proposed by @kmike --- docs/topics/feed-exports.rst | 2 +- scrapy/extensions/feedexport.py | 2 +- scrapy/settings/default_settings.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index dee0c3ffa7a..cf70b8acaca 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -312,7 +312,7 @@ The keys are URI schemes and the values are paths to storage classes. FEED_STORAGE_S3_ACL ------------------- -Default: ``None`` +Default: ``''`` (empty string) A string containing a custom ACL for feeds exported to Amazon S3 by your project. diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 40f985f19a6..975fa1229fd 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -135,7 +135,7 @@ def from_crawler(cls, crawler, uri): uri=uri, access_key=crawler.settings['AWS_ACCESS_KEY_ID'], secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'], - acl=crawler.settings['FEED_STORAGE_S3_ACL'] + acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None ) def _store_in_thread(self, file): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 776c5af23aa..a800d39ab75 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -158,7 +158,7 @@ } FEED_EXPORT_INDENT = 0 -FEED_STORAGE_S3_ACL = None +FEED_STORAGE_S3_ACL = '' FILES_STORE_S3_ACL = 'private' FILES_STORE_GCS_ACL = '' From b4d132b9f0824263d83331d0b36870f6f64918e4 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 13 Feb 2019 19:21:14 -0200 Subject: [PATCH 16/20] setting botocore version as described in debian jessie website https://packages.debian.org/en/jessie/python-botocore --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index f2f3e1293d0..584da2dcd94 100644 --- a/tox.ini +++ b/tox.ini @@ -47,7 +47,7 @@ deps = lxml==3.4.0 Twisted==14.0.2 boto==2.34.0 - botocore==1.12.89 + botocore==0.62 Pillow==2.6.1 cssselect==0.9.1 zope.interface==4.1.1 From dc0b643832e9f3400e432c2ef7a34e6c75ac8366 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 13 Feb 2019 19:44:50 -0200 Subject: [PATCH 17/20] refactoring tests to avoid mocking private method --- tests/test_feedexport.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 0f31ef00ec2..c103593f951 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -248,12 +248,9 @@ def test_store_botocore_without_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, None) - with mock.patch('botocore.client.BaseClient._make_api_call') as m: - yield storage.store(BytesIO(b'test file')) - - operation_name, api_params = m.call_args[0] - self.assertEqual(operation_name, 'PutObject') - self.assertNotIn('ACL', api_params) + storage.s3_client = mock.MagicMock() + yield storage.store(BytesIO(b'test file')) + self.assertNotIn('ACL', storage.s3_client.put_object.call_args[1]) @defer.inlineCallbacks def test_store_botocore_with_acl(self): @@ -267,12 +264,12 @@ def test_store_botocore_with_acl(self): self.assertEqual(storage.secret_key, 'secret_key') self.assertEqual(storage.acl, 'custom-acl') - with mock.patch('botocore.client.BaseClient._make_api_call') as m: - yield storage.store(BytesIO(b'test file')) - - operation_name, api_params = m.call_args[0] - self.assertEqual(operation_name, 'PutObject') - self.assertEqual(api_params.get('ACL'), 'custom-acl') + storage.s3_client = mock.MagicMock() + yield storage.store(BytesIO(b'test file')) + self.assertEqual( + storage.s3_client.put_object.call_args[1].get('ACL'), + 'custom-acl' + ) @defer.inlineCallbacks def test_store_not_botocore_without_acl(self): From ea8be627d15aa6fe1beaf50fff666cbeb161d94d Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 13 Feb 2019 19:53:10 -0200 Subject: [PATCH 18/20] botocore is not supported on debian jessie --- tests/test_feedexport.py | 7 ++++++- tox.ini | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index c103593f951..2bf57e278dc 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -18,7 +18,6 @@ from tests.mockserver import MockServer from w3lib.url import path_to_file_uri -import botocore.client import scrapy from scrapy.exporters import CsvItemExporter from scrapy.extensions.feedexport import ( @@ -239,6 +238,9 @@ def test_from_crawler_with_acl(self): @defer.inlineCallbacks def test_store_botocore_without_acl(self): + if os.getenv('TOX_ENV_NAME') == 'jessie': + raise unittest.SkipTest('botocore is not supported on jessie') + storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -254,6 +256,9 @@ def test_store_botocore_without_acl(self): @defer.inlineCallbacks def test_store_botocore_with_acl(self): + if os.getenv('TOX_ENV_NAME') == 'jessie': + raise unittest.SkipTest('botocore is not supported on jessie') + storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', diff --git a/tox.ini b/tox.ini index 584da2dcd94..0c0f8f7b7d5 100644 --- a/tox.ini +++ b/tox.ini @@ -47,7 +47,6 @@ deps = lxml==3.4.0 Twisted==14.0.2 boto==2.34.0 - botocore==0.62 Pillow==2.6.1 cssselect==0.9.1 zope.interface==4.1.1 From 9b8ba4c383df0f3029d1b07ab9647a7d902600f4 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 14 Feb 2019 16:20:56 -0200 Subject: [PATCH 19/20] try to import botocore before runing some tests --- tests/test_feedexport.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 2bf57e278dc..3ff79c9123c 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -238,8 +238,10 @@ def test_from_crawler_with_acl(self): @defer.inlineCallbacks def test_store_botocore_without_acl(self): - if os.getenv('TOX_ENV_NAME') == 'jessie': - raise unittest.SkipTest('botocore is not supported on jessie') + try: + import botocore + except ImportError: + raise unittest.SkipTest('botocore is required') storage = S3FeedStorage( 's3://mybucket/export.csv', @@ -256,8 +258,10 @@ def test_store_botocore_without_acl(self): @defer.inlineCallbacks def test_store_botocore_with_acl(self): - if os.getenv('TOX_ENV_NAME') == 'jessie': - raise unittest.SkipTest('botocore is not supported on jessie') + try: + import botocore + except ImportError: + raise unittest.SkipTest('botocore is required') storage = S3FeedStorage( 's3://mybucket/export.csv', From 9fed6fcb51fb58a74c10ae27f17e39d13c478846 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 14 Feb 2019 16:59:51 -0200 Subject: [PATCH 20/20] trigger tests