
Commit

Merge pull request #3199 from rhoboro/gcs_acl
[MRG+1] FilesPipeline supports ACL for Google Cloud Storage
lopuhin committed Apr 25, 2018
2 parents da1256a + 6ef6585 commit f36e1b5
Showing 6 changed files with 31 additions and 3 deletions.
15 changes: 15 additions & 0 deletions docs/topics/media-pipeline.rst
@@ -189,6 +189,8 @@ Google Cloud Storage
---------------------

.. setting:: GCS_PROJECT_ID
.. setting:: FILES_STORE_GCS_ACL
.. setting:: IMAGES_STORE_GCS_ACL

:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can represent a Google Cloud Storage
bucket. Scrapy will automatically upload the files to the bucket. (requires `google-cloud-storage`_ )
@@ -204,6 +206,19 @@ For information about authentication, see this `documentation`_.

.. _documentation: https://cloud.google.com/docs/authentication/production

You can modify the Access Control List (ACL) policy applied to the stored files
with the :setting:`FILES_STORE_GCS_ACL` and
:setting:`IMAGES_STORE_GCS_ACL` settings. By default, the ACL is set to
``''`` (empty string), which means that Cloud Storage applies the bucket's default object ACL to the object.
To make the files publicly available, use the ``publicRead``
policy::

    IMAGES_STORE_GCS_ACL = 'publicRead'

For more information, see `Predefined ACLs`_ in the Google Cloud Platform Developer Guide.

.. _Predefined ACLs: https://cloud.google.com/storage/docs/access-control/lists#predefined-acl
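
As a combined sketch, a project that stores files in a bucket and makes them
publicly readable might configure something like the following (the bucket
name and project ID below are placeholders, not values from this change)::

    FILES_STORE = 'gs://example-media-bucket/files'   # placeholder bucket
    GCS_PROJECT_ID = 'example-project-id'              # placeholder project
    FILES_STORE_GCS_ACL = 'publicRead'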

Usage example
=============

8 changes: 7 additions & 1 deletion scrapy/pipelines/files.py
@@ -208,6 +208,10 @@ class GCSFilesStore(object):

CACHE_CONTROL = 'max-age=172800'

# The bucket's default object ACL will be applied to the object.
# Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
POLICY = None

def __init__(self, uri):
from google.cloud import storage
client = storage.Client(project=self.GCS_PROJECT_ID)
@@ -239,7 +243,8 @@ def persist_file(self, path, buf, info, meta=None, headers=None):
return threads.deferToThread(
blob.upload_from_string,
data=buf.getvalue(),
content_type=self._get_content_type(headers)
content_type=self._get_content_type(headers),
predefined_acl=self.POLICY
)
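
For reference, a minimal standalone sketch of the upload call this change makes
with the ``google-cloud-storage`` client (the project, bucket and object names
are placeholders, not values from this change):

    from google.cloud import storage

    # Placeholders, not values from this change:
    client = storage.Client(project='example-project-id')
    bucket = client.get_bucket('example-media-bucket')
    blob = bucket.blob('full/filename')
    # predefined_acl=None leaves the bucket's default object ACL in place;
    # a value such as 'publicRead' applies that predefined ACL to the object.
    blob.upload_from_string(
        data=b'file contents',
        content_type='application/octet-stream',
        predefined_acl=None,
    )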


@@ -314,6 +319,7 @@ def from_settings(cls, settings):

gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None

store_uri = settings['FILES_STORE']
return cls(store_uri, settings=settings)
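
The ``or None`` coercion means the default empty-string setting reaches the
store as ``None``, so no ``predefined_acl`` is sent and the bucket's default
object ACL applies; any non-empty value such as ``'publicRead'`` is passed
through unchanged. A hypothetical helper illustrating the same logic:

    def _acl_policy(setting_value):
        # Hypothetical helper, not part of this change; mirrors `settings[...] or None`.
        # '' (the default) -> None         -> bucket default object ACL
        # 'publicRead'     -> 'publicRead' -> predefined ACL applied per object
        return setting_value or None

    assert _acl_policy('') is None
    assert _acl_policy('publicRead') == 'publicRead'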
1 change: 1 addition & 0 deletions scrapy/pipelines/images.py
@@ -93,6 +93,7 @@ def from_settings(cls, settings):

gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

store_uri = settings['IMAGES_STORE']
return cls(store_uri, settings=settings)
2 changes: 2 additions & 0 deletions scrapy/settings/default_settings.py
@@ -159,6 +159,7 @@
FEED_EXPORT_INDENT = 0

FILES_STORE_S3_ACL = 'private'
FILES_STORE_GCS_ACL = ''

FTP_USER = 'anonymous'
FTP_PASSWORD = 'guest'
@@ -181,6 +182,7 @@
HTTPPROXY_AUTH_ENCODING = 'latin-1'

IMAGES_STORE_S3_ACL = 'private'
IMAGES_STORE_GCS_ACL = ''

ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'

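
Both new defaults can be overridden project-wide in ``settings.py`` or per
spider via ``custom_settings``; a hedged sketch (the spider below is a
hypothetical example, not part of this change):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        # Hypothetical spider used only to illustrate overriding the defaults.
        name = 'example'
        custom_settings = {
            'FILES_STORE_GCS_ACL': 'publicRead',
            'IMAGES_STORE_GCS_ACL': 'publicRead',
        }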
3 changes: 2 additions & 1 deletion scrapy/utils/test.py
@@ -57,8 +57,9 @@ def get_gcs_content_and_delete(bucket, path):
bucket = client.get_bucket(bucket)
blob = bucket.get_blob(path)
content = blob.download_as_string()
acl = list(blob.acl) # read the ACL before the blob is deleted
bucket.delete_blob(path)
return content, blob
return content, acl, blob

def get_crawler(spidercls=None, settings_dict=None):
"""Return an unconfigured Crawler object. If settings_dict is given, it
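
A sketch of how the updated helper is consumed (mirroring the test change
below; the bucket and path values are placeholders):

    from scrapy.utils.test import get_gcs_content_and_delete

    # Placeholder bucket and object path:
    content, acl, blob = get_gcs_content_and_delete(
        'example-media-bucket', 'full/filename')
    assert {'role': 'READER', 'entity': 'allAuthenticatedUsers'} in acl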
5 changes: 4 additions & 1 deletion tests/test_pipeline_files.py
@@ -388,17 +388,20 @@ def test_persist(self):
meta = {'foo': 'bar'}
path = 'full/filename'
store = GCSFilesStore(uri)
store.POLICY = 'authenticatedRead'
expected_policy = {'role': 'READER', 'entity': 'allAuthenticatedUsers'}
yield store.persist_file(path, buf, info=None, meta=meta, headers=None)
s = yield store.stat_file(path, info=None)
self.assertIn('last_modified', s)
self.assertIn('checksum', s)
self.assertEqual(s['checksum'], 'zc2oVgXkbQr2EQdSdw3OPA==')
u = urlparse(uri)
content, blob = get_gcs_content_and_delete(u.hostname, u.path[1:]+path)
content, acl, blob = get_gcs_content_and_delete(u.hostname, u.path[1:]+path)
self.assertEqual(content, data)
self.assertEqual(blob.metadata, {'foo': 'bar'})
self.assertEqual(blob.cache_control, GCSFilesStore.CACHE_CONTROL)
self.assertEqual(blob.content_type, 'application/octet-stream')
self.assertIn(expected_policy, acl)
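
For context, the ``authenticatedRead`` predefined ACL grants ``READER`` access
to ``allAuthenticatedUsers`` (plus ``OWNER`` for the object owner), which is
the entry the new assertion looks for. A sketch of inspecting an object's ACL
with the ``google-cloud-storage`` client (the project, bucket and object names
are placeholders):

    from google.cloud import storage

    # Placeholders, not values from this change:
    client = storage.Client(project='example-project-id')
    blob = client.get_bucket('example-media-bucket').get_blob('full/filename')
    # As in get_gcs_content_and_delete above, listing blob.acl yields entries
    # such as {'role': 'READER', 'entity': 'allAuthenticatedUsers'}.
    for entry in list(blob.acl):
        print(entry['entity'], entry['role'])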


class ItemWithFiles(Item):
