Storage object deletion waits for consistency. (googledatalab#354)
* Storage object deletion waits for consistency.

Fixes googledatalab#344.

GCS object deletions are eventually consistent, in that deleting an object in
a globally-available bucket may result in the object still appearing when
listing objects in the bucket:
  https://cloud.google.com/storage/docs/consistency#eventually_consistent_operations
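
To make the failure mode concrete, here is a rough sketch of the symptom
(the bucket and object names are made up, and `Bucket.objects()` is assumed
to list the bucket's contents under the default context):

```python
from google.datalab import storage

bucket = storage.Bucket('my-example-bucket')   # assumes this bucket already exists
obj = bucket.object('example-object')
obj.write_stream('contents', 'text/plain')

# Delete without the new wait, to show the raw behavior.
obj.delete(wait_for_deletion=False)

# Object listing is only eventually consistent, so for a short window this
# can occasionally still report the just-deleted object.
print(any(o.key == 'example-object' for o in bucket.objects()))
```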

This is a detail, but it can confuse users: someone following the examples in
the tutorial notebooks may see rare-but-regular failures reporting that an
object they just deleted still exists. (We've seen this while running the
notebooks as automated tests.) This PR teaches an object to poll its bucket
after deletion until it no longer appears in the listing. The behavior is
controlled by a new `wait_for_deletion` argument and defaults to **on**.
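
A minimal usage sketch under the same made-up names:

```python
from google.datalab import storage

obj = storage.Bucket('my-example-bucket').object('example-object')

# Default after this change: block until the object no longer shows up in the
# bucket listing (at most _MAX_POLL_ATTEMPTS polls, sleeping _POLLING_SLEEP
# seconds between attempts).
obj.delete()

# Callers that can tolerate the eventual consistency can opt out of the wait:
# obj.delete(wait_for_deletion=False)
```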

I've added two tests: the first is a mock-based unit test that passes today;
the second is intended as a first step toward integration tests. For the
integration test, we assume that the user running it has ambient credentials
that `google.datalab` will discover.
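
For reference, a sketch of what "ambient credentials" means here (it mirrors
the `setUp` in the new integration test; nothing beyond `Context.default()` is
configured explicitly, and Application Default Credentials are assumed to be
available in the environment):

```python
import google.datalab

# Context.default() resolves the project and credentials from the environment
# (for example, Application Default Credentials), so the integration test does
# not pass any explicit configuration.
context = google.datalab.Context.default()
print(context.project_id)
```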
craigcitro authored and yebrahim committed Apr 14, 2017
1 parent f1522f7 commit 3c7e44d
Showing 4 changed files with 98 additions and 2 deletions.
25 changes: 24 additions & 1 deletion google/datalab/storage/_object.py
@@ -16,6 +16,8 @@
from builtins import object

import dateutil.parser
import logging
import time

import google.datalab
import google.datalab.utils
@@ -25,6 +27,12 @@
# TODO(nikhilko): Read/write operations don't account for larger files, or non-textual content.
# Use streaming reads into a buffer or StringIO or into a file handle.

# In some polling operations, we sleep between API calls to avoid hammering the
# server. This constant controls how long (in seconds) we sleep between calls.
_POLLING_SLEEP = 1
# This constant controls how many times we'll poll before giving up.
_MAX_POLL_ATTEMPTS = 30


class ObjectMetadata(object):
"""Represents metadata about a Cloud Storage object."""
@@ -136,9 +144,13 @@ def exists(self):
    except Exception as e:
      raise e

  def delete(self):
  def delete(self, wait_for_deletion=True):
    """Deletes this object from its bucket.

    Args:
      wait_for_deletion: If True, we poll until this object no longer appears in
        objects.list operations for this bucket before returning.

    Raises:
      Exception if there was an error deleting the object.
    """
@@ -147,6 +159,17 @@ def delete(self):
      self._api.objects_delete(self._bucket, self._key)
    except Exception as e:
      raise e
    if wait_for_deletion:
      for _ in range(_MAX_POLL_ATTEMPTS):
        objects = Objects(self._bucket, prefix=self.key, delimiter='/',
                          context=self._context)
        if any(o.key == self.key for o in objects):
          time.sleep(_POLLING_SLEEP)
          continue
        break
      else:
        logging.error('Failed to see object deletion after %d attempts.',
                      _MAX_POLL_ATTEMPTS)

  @property
  def metadata(self):
37 changes: 37 additions & 0 deletions tests/integration/storage_test.py
@@ -0,0 +1,37 @@
"""Integration tests for google.datalab.storage."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import random
import string
import unittest

import google.datalab
from google.datalab import storage


class StorageTest(unittest.TestCase):

  def setUp(self):
    self._context = google.datalab.Context.default()
    logging.info('Using project: %s', self._context.project_id)

    suffix = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
    self._test_bucket_name = '{}-{}'.format(self._context.project_id, suffix)
    logging.info('test bucket: %s', self._test_bucket_name)

  def test_object_deletion_consistency(self):
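    # End-to-end check against a real bucket: create a bucket and an object,
    # then delete both. With the new default, o.delete() polls until the
    # object no longer appears in the bucket's listing before returning.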
    b = storage.Bucket(self._test_bucket_name, context=self._context)
    b.create()
    o = b.object('sample')
    o.write_stream('contents', 'text/plain')
    o.delete()
    b.delete()


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  unittest.main()
5 changes: 4 additions & 1 deletion tests/kernel/storage_tests.py
@@ -156,6 +156,7 @@ def test_gcs_create(self, mock_context_default, mock_api_buckets_insert):
self.assertEqual("Couldn't create gs://foo/bar: Invalid bucket name gs://foo/bar",
str(error.exception))

  @mock.patch('google.datalab.storage._api.Api.objects_list', autospec=True)
  @mock.patch('google.datalab.storage._api.Api.buckets_get', autospec=True)
  @mock.patch('google.datalab.storage._api.Api.objects_get', autospec=True)
  @mock.patch('google.datalab.storage._bucket.Bucket.objects', autospec=True)
@@ -164,14 +165,16 @@ def test_gcs_create(self, mock_context_default, mock_api_buckets_insert):
  @mock.patch('google.datalab.Context.default')
  def test_gcs_delete(self, mock_context_default, mock_api_bucket_delete,
                      mock_api_objects_delete, mock_bucket_objects, mock_api_objects_get,
                      mock_api_buckets_get):
                      mock_api_buckets_get, mock_api_objects_list):
    context = TestCases._create_context()
    mock_context_default.return_value = context
    # Mock API for getting objects in a bucket.
    mock_bucket_objects.side_effect = TestCases._mock_bucket_objects_return(context)
    # Mock API for getting object metadata.
    mock_api_objects_get.side_effect = TestCases._mock_api_objects_get()
    mock_api_buckets_get.side_effect = TestCases._mock_api_buckets_get()
    # Mock API for listing objects in a bucket: always return an empty listing.
    mock_api_objects_list.return_value = {}

    with self.assertRaises(Exception) as error:
      google.datalab.storage.commands._storage._gcs_delete({
33 changes: 33 additions & 0 deletions tests/storage/object_tests.py
@@ -79,6 +79,39 @@ def test_enumerate_objects_multi_page(self, mock_api_objects):
    self.assertEqual(objects[0].key, 'test_object1')
    self.assertEqual(objects[1].key, 'test_object2')

  @mock.patch('google.datalab.storage._api.Api.objects_list')
  def test_object_delete_with_wait(self, mock_objects_list):
    stable_object_name = 'testobject'
    object_to_delete = 'temporaryobject'
    mock_objects_list.side_effect = [
        {'items': [{'name': stable_object_name}], 'nextPageToken': 'yes'},
        {'items': [{'name': object_to_delete}]},
        {'items': [{'name': stable_object_name}]},
    ]

    b = TestCases._create_bucket()
    o = b.object(object_to_delete)
    o._info = {'name': object_to_delete}

    with mock.patch.object(google.datalab.storage._api.Api, 'objects_delete',
                           autospec=True) as mock_objects_delete:
      o.delete(wait_for_deletion=False)
      self.assertEqual(1, mock_objects_delete.call_count)
      # storage.objects.list shouldn't have been called with
      # wait_for_deletion=False.
      self.assertEqual(0, mock_objects_list.call_count)

    with mock.patch.object(google.datalab.storage._api.Api, 'objects_delete',
                           autospec=True) as mock_objects_delete:
      o.delete()
      self.assertEqual(1, mock_objects_delete.call_count)
      # storage.objects.list should have been called three times with
      # wait_for_deletion=True:
      # * twice on the first run, paging through all results, with the object
      #   still present in the bucket, and
      # * once on a second run, now with no object present in the list.
      self.assertEqual(3, mock_objects_list.call_count)

  @staticmethod
  def _create_bucket(name='test_bucket'):
    return google.datalab.storage.Bucket(name, context=TestCases._create_context())
