Skip to content

Commit

Permalink
Fix rare race condition on opening ephemeral copy
Browse files Browse the repository at this point in the history
Summary:
Fixes facebook#5079. So what happens was
1. Durable uploads CLOUDMANIFEST
2. Non-durable downloads CLOUDMANIFEST
3. Non-durable tried to open MANIFEST, couldn't find it locally nor in the cloud, assuming it's a new DB.
4. Durable uploads new MANIFEST
5. Non-durable wrote new epoch to its local CLOUDMANIFEST
6. Non-durable failed to open MANIFEST, restarts. At restart, it resuses its local CLOUDMANIFEST without compatible MANIFEST, goes into crash loop.

If 3 and 4 reverses order, non-durable will be able to open the DB. This diff ensures that if this scenario occurs, non-durable will reinitializes the entire directory.

Test Plan: added a unittest to describe such scenario.

Reviewers: dhruba, igor

Reviewed By: igor

Differential Revision: https://rockset.phacility.com/D5571
  • Loading branch information
hicder committed Mar 3, 2020
1 parent 17ec302 commit 244adf6
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 19 deletions.
5 changes: 5 additions & 0 deletions cloud/aws/aws_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,11 @@ class AwsEnv : public CloudEnvImpl {
file_deletion_delay_ = delay;
}

Status TEST_DeletePathInS3(const std::string& bucket,
const std::string& fname) {
return DeletePathInS3(bucket, fname);
}

private:
//
// The AWS credentials are specified to the constructor via
Expand Down
46 changes: 39 additions & 7 deletions cloud/cloud_env_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,24 @@ Status CloudEnvImpl::LoadLocalCloudManifest(const std::string& dbname) {
if (cloud_manifest_) {
cloud_manifest_.reset();
}
return CloudEnvImpl::LoadLocalCloudManifest(dbname, GetBaseEnv(),
&cloud_manifest_);
}

Status CloudEnvImpl::LoadLocalCloudManifest(
const std::string& dbname, Env* base_env,
std::unique_ptr<CloudManifest>* cloud_manifest) {
std::unique_ptr<SequentialFile> file;
auto cloudManifestFile = CloudManifestFile(dbname);
auto s =
GetBaseEnv()->NewSequentialFile(cloudManifestFile, &file, EnvOptions());
auto cloud_manifest_file_name = CloudManifestFile(dbname);
auto s = base_env->NewSequentialFile(cloud_manifest_file_name, &file,
EnvOptions());
if (!s.ok()) {
return s;
}
return CloudManifest::LoadFromLog(
std::unique_ptr<SequentialFileReader>(
new SequentialFileReader(std::move(file), cloudManifestFile)),
&cloud_manifest_);
std::unique_ptr<SequentialFileReader>(
new SequentialFileReader(std::move(file), cloud_manifest_file_name)),
cloud_manifest);
}

std::string CloudEnvImpl::RemapFilename(const std::string& logical_path) const {
Expand Down Expand Up @@ -438,14 +445,39 @@ Status CloudEnvImpl::NeedsReinitialization(const std::string& local_dir,
}

// We found a local dbid but we did not find this dbid in bucket registry.
// This is an ephemeral clone.
if (src_object_path.empty() && dest_object_path.empty()) {
Log(InfoLogLevel::ERROR_LEVEL, info_log_,
"[cloud_env_impl] NeedsReinitialization: "
"local dbid %s does not have a mapping in cloud registry "
"src bucket %s or dest bucket %s",
local_dbid.c_str(), src_bucket.c_str(), dest_bucket.c_str());

// This is an ephemeral clone. Resync all files from cloud.
// The local CLOUDMANIFEST on ephemeral clone is by definition out-of-sync
// with the CLOUDMANIFEST in the cloud. That means we need to make sure the
// local MANIFEST is compatible with the local CLOUDMANIFEST. Otherwise
// there is no way we can recover since all MANIFEST files on the cloud are
// only compatible with CLOUDMANIFEST on the cloud.
//
// If the local MANIFEST is not compatible with local CLOUDMANIFEST, we will
// need to reinitialize the entire directory.
std::unique_ptr<CloudManifest> cloud_manifest;
Env* base_env = GetBaseEnv();
Status load_status =
LoadLocalCloudManifest(local_dir, base_env, &cloud_manifest);
if (load_status.ok()) {
std::string current_epoch = cloud_manifest->GetCurrentEpoch().ToString();
Status local_manifest_exists =
base_env->FileExists(ManifestFileWithEpoch(local_dir, current_epoch));
if (!local_manifest_exists.ok()) {
Log(InfoLogLevel::WARN_LEVEL, info_log_,
"[cloud_env_impl] NeedsReinitialization: CLOUDMANIFEST exists "
"locally, but no local MANIFEST is compatible");
return Status::OK();
}
}

// Resync all files from cloud.
// If the resycn failed, then return success to indicate that
// the local directory needs to be completely removed and recreated.
st = ResyncDir(local_dir);
Expand Down
7 changes: 7 additions & 0 deletions cloud/cloud_env_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,14 @@ class CloudEnvImpl : public CloudEnv {
const DbidList& dbid_list, DbidParents* parents);
virtual Status PreloadCloudManifest(const std::string& local_dbname) override;

// Load CLOUDMANIFEST if exists in local disk to current env.
Status LoadLocalCloudManifest(const std::string& dbname);

// Local CLOUDMANIFEST from `base_env` into `cloud_manifest`.
static Status LoadLocalCloudManifest(
const std::string& dbname, Env* base_env,
std::unique_ptr<CloudManifest>* cloud_manifest);

// Transfers the filename from RocksDB's domain to the physical domain, based
// on information stored in CLOUDMANIFEST.
// For example, it will map 00010.sst to 00010.sst-[epoch] where [epoch] is
Expand Down
88 changes: 76 additions & 12 deletions cloud/db_cloud_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@

#include "rocksdb/cloud/db_cloud.h"

#include <cinttypes>
#include <algorithm>
#include <chrono>
#include <cinttypes>

#include "cloud/aws/aws_env.h"
#include "cloud/aws/aws_file.h"
#include "cloud/db_cloud_impl.h"
#include "cloud/filename.h"
#include "cloud/manifest_reader.h"
#include "logging/logging.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
Expand Down Expand Up @@ -133,11 +134,11 @@ class CloudTest : public testing::Test {
}

// Creates and Opens a clone
void CloneDB(const std::string& clone_name,
const std::string& dest_bucket_name,
const std::string& dest_object_path,
std::unique_ptr<DBCloud>* cloud_db,
std::unique_ptr<CloudEnv>* cloud_env) {
Status CloneDB(const std::string& clone_name,
const std::string& dest_bucket_name,
const std::string& dest_object_path,
std::unique_ptr<DBCloud>* cloud_db,
std::unique_ptr<CloudEnv>* cloud_env) {
// The local directory where the clone resides
std::string cname = clone_dir_ + "/" + clone_name;

Expand All @@ -153,11 +154,15 @@ class CloudTest : public testing::Test {
copt.dest_bucket.SetBucketName(dest_bucket_name);
}
copt.dest_bucket.SetObjectPath(dest_object_path);
if (! copt.dest_bucket.IsValid()) {
if (!copt.dest_bucket.IsValid()) {
copt.keep_local_sst_files = true;
}
// Create new AWS env
ASSERT_OK(CloudEnv::NewAwsEnv(base_env_, copt, options_.info_log, &cenv));
Status st = CloudEnv::NewAwsEnv(base_env_, copt, options_.info_log, &cenv);
if (!st.ok()) {
return st;
}

// To catch any possible file deletion bugs, we set file deletion delay to
// smallest possible
((AwsEnv*)cenv)->TEST_SetFileDeletionDelay(std::chrono::seconds(0));
Expand All @@ -175,15 +180,20 @@ class CloudTest : public testing::Test {
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cfopt));
std::vector<ColumnFamilyHandle*> handles;

ASSERT_OK(DBCloud::Open(options_, cname, column_families,
persistent_cache_path_, persistent_cache_size_gb_,
&handles, &clone_db));
st = DBCloud::Open(options_, cname, column_families, persistent_cache_path_,
persistent_cache_size_gb_, &handles, &clone_db);
if (!st.ok()) {
return st;
}

cloud_db->reset(clone_db);

// Delete the handle for the default column family because the DBImpl
// always holds a reference to it.
ASSERT_TRUE(handles.size() > 0);
assert(handles.size() > 0);
delete handles[0];

return st;
}

void CloseDB() {
Expand Down Expand Up @@ -1165,6 +1175,60 @@ TEST_F(CloudTest, Ephemeral) {
}
}

// This test is performed in a rare race condition where ephemral clone is
// started after durable clone upload its CLOUDMANIFEST but before it uploads
// one of the MANIFEST. In this case, we want to verify that ephemeral clone is
// able to reinitialize instead of crash looping.
TEST_F(CloudTest, EphemeralOnCorruptedDB) {
cloud_env_options_.keep_local_sst_files = true;
options_.level0_file_num_compaction_trigger = 100; // never compact

OpenDB();

std::vector<std::string> files;
base_env_->GetChildren(dbname_, &files);

// Get the MANIFEST file
std::string manifest_file_name;
for (const auto& file_name : files) {
if (file_name.rfind("MANIFEST", 0) == 0) {
manifest_file_name = file_name;
break;
}
}

ASSERT_FALSE(manifest_file_name.empty());

// Delete MANIFEST file from S3 bucket.
// This is to simulate the scenario where CLOUDMANIFEST is uploaded, but
// MANIFEST is not yet uploaded from the durable shard.
auto aws_env = dynamic_cast<AwsEnv*>(aenv_.get());
ASSERT_TRUE(aws_env != nullptr);
aws_env->TEST_DeletePathInS3(
aws_env->GetSrcBucketName(),
aws_env->GetSrcObjectPath() + "/" + manifest_file_name);

// Ephemeral clone should fail.
std::unique_ptr<DBCloud> clone_db;
std::unique_ptr<CloudEnv> cenv;
Status st = CloneDB("clone1", "", "", &clone_db, &cenv);
ASSERT_NOK(st);

// Put the MANIFEST file back
aws_env->PutObject(dbname_ + "/" + manifest_file_name,
aws_env->GetSrcBucketName(),
aws_env->GetSrcObjectPath() + "/" + manifest_file_name);

// Try one more time. This time it should succeed.
clone_db.reset();
cenv.reset();
st = CloneDB("clone1", "", "", &clone_db, &cenv);
ASSERT_OK(st);

clone_db->Close();
CloseDB();
}

//
// Test Ephemeral clones with resyncOnOpen mode.
// In this mode, every open of the ephemeral clone db causes its
Expand Down

0 comments on commit 244adf6

Please sign in to comment.