Adds an emergency_repair="_debug_recommit" mode
VeXocide committed Sep 1, 2015
1 parent 69461f6 commit 072575e
Showing 14 changed files with 46 additions and 26 deletions.
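
For context, the new mode is selected through the existing `emergency_repair` argument of `reconfigure`, alongside "unsafe_rollback" and "unsafe_rollback_or_erase" (see src/rdb_protocol/terms/db_table.cc below). Unlike the other two modes it leaves the table's configuration unchanged and simply re-commits it under a new Raft epoch. A minimal sketch of invoking it from the Python driver follows; the table name and connection details are assumptions rather than part of the commit:

    import rethinkdb as r

    conn = r.connect("localhost", 28015)  # connection details are assumptions

    # Preview what the repair would do without applying it.
    print(r.table("recommit").reconfigure(
        emergency_repair="_debug_recommit", dry_run=True).run(conn))

    # Re-commit the table's current configuration under a new Raft epoch.
    r.table("recommit").reconfigure(emergency_repair="_debug_recommit").run(conn)
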
4 changes: 2 additions & 2 deletions src/clustering/administration/artificial_reql_cluster_interface.cc
@@ -284,7 +284,7 @@ bool artificial_reql_cluster_interface_t::db_reconfigure(
bool artificial_reql_cluster_interface_t::table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out,
@@ -296,7 +296,7 @@ bool artificial_reql_cluster_interface_t::table_emergency_repair(
query_state_t::FAILED};
return false;
}
-return next->table_emergency_repair(db, name, allow_erase, dry_run, interruptor,
+return next->table_emergency_repair(db, name, mode, dry_run, interruptor,
result_out, error_out);
}

2 changes: 1 addition & 1 deletion src/clustering/administration/artificial_reql_cluster_interface.hpp
@@ -135,7 +135,7 @@ class artificial_reql_cluster_interface_t : public reql_cluster_interface_t {
bool table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_erase,
+emergency_repair_mode_t,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out,
13 changes: 7 additions & 6 deletions src/clustering/administration/real_reql_cluster_interface.cc
@@ -700,7 +700,7 @@ bool real_reql_cluster_interface_t::db_reconfigure(
void real_reql_cluster_interface_t::emergency_repair_internal(
const counted_t<const ql::db_t> &db,
const namespace_id_t &table_id,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor_on_home,
ql::datum_t *result_out)
@@ -734,18 +734,19 @@ void real_reql_cluster_interface_t::emergency_repair_internal(
bool erase_found;
table_meta_client->emergency_repair(
table_id,
-allow_erase,
+mode,
dry_run,
interruptor_on_home,
&new_config,
&rollback_found,
&erase_found);

-if (!rollback_found) {
+if (!rollback_found && mode != emergency_repair_mode_t::DEBUG_RECOMMIT) {
if (!erase_found) {
throw admin_op_exc_t("This table doesn't need to be repaired.",
query_state_t::FAILED);
-} else if (erase_found && !allow_erase) {
+} else if (erase_found &&
+mode != emergency_repair_mode_t::UNSAFE_ROLLBACK_OR_ERASE) {
throw admin_op_exc_t(
"One or more shards of this table have no available "
"replicas. Since there are no available copies of the data that was "
@@ -788,7 +789,7 @@ void real_reql_cluster_interface_t::emergency_repair_internal(
bool real_reql_cluster_interface_t::table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor_on_caller,
ql::datum_t *result_out,
@@ -800,7 +801,7 @@ bool real_reql_cluster_interface_t::table_emergency_repair(
on_thread_t thread_switcher(home_thread());
namespace_id_t table_id;
table_meta_client->find(db->id, name, &table_id);
-emergency_repair_internal(db, table_id, allow_erase, dry_run,
+emergency_repair_internal(db, table_id, mode, dry_run,
&interruptor_on_home, result_out);
return true;
} catch (const admin_op_exc_t &msg) {
4 changes: 2 additions & 2 deletions src/clustering/administration/real_reql_cluster_interface.hpp
@@ -125,7 +125,7 @@ class real_reql_cluster_interface_t :
bool table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_erase,
+emergency_repair_mode_t,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out,
@@ -234,7 +234,7 @@ class real_reql_cluster_interface_t :
void emergency_repair_internal(
const counted_t<const ql::db_t> &db,
const namespace_id_t &table_id,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out)
10 changes: 8 additions & 2 deletions src/clustering/table_contract/emergency_repair.cc
@@ -38,13 +38,19 @@ bool any_dead(
void calculate_emergency_repair(
const table_raft_state_t &old_state,
const std::set<server_id_t> &dead_servers,
-bool allow_erase,
+emergency_repair_mode_t mode,
table_raft_state_t *new_state_out,
bool *rollback_found_out,
bool *erase_found_out) {
*rollback_found_out = false;
*erase_found_out = false;

+/* If we're in `"_debug_recommit"` mode we simply copy the old state and are done */
+if (mode == emergency_repair_mode_t::DEBUG_RECOMMIT) {
+*new_state_out = old_state;
+return;
+}
+
/* Pick the server we'll use as a replacement for shards that we end up erasing */
server_id_t erase_replacement = nil_uuid();
for (const auto &pair : old_state.member_ids) {
@@ -62,7 +68,7 @@ void calculate_emergency_repair(
contract_t contract = pair.second.second;
if (all_dead(contract.replicas, dead_servers)) {
*erase_found_out = true;
-if (allow_erase) {
+if (mode == emergency_repair_mode_t::UNSAFE_ROLLBACK_OR_ERASE) {
/* Discard all previous replicas, and set up a new empty replica on
`erase_replacement`. */
contract = contract_t();
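To make the new branch easier to follow in isolation, here is a small Python model of how the three modes diverge inside calculate_emergency_repair. It is only a sketch: contracts are reduced to plain replica lists, the erase-replacement server is passed in directly, and the real quorum/rollback handling is collapsed to "keep only the live replicas".

    # Illustrative model only -- not the real C++ implementation.
    def calculate_emergency_repair(old_contracts, dead_servers, mode, erase_replacement):
        """Return (new_contracts, rollback_found, erase_found)."""
        rollback_found = False
        erase_found = False

        # "_debug_recommit": copy the old state unchanged; the caller still
        # commits it under a new Raft epoch.
        if mode == "_debug_recommit":
            return [list(c) for c in old_contracts], rollback_found, erase_found

        new_contracts = []
        for replicas in old_contracts:
            live = [s for s in replicas if s not in dead_servers]
            if not live:
                # Every replica of this shard is gone.
                erase_found = True
                if mode == "unsafe_rollback_or_erase":
                    # Discard the old replicas; start an empty one elsewhere.
                    replicas = [erase_replacement]
            elif len(live) < len(replicas):
                # Some replicas are gone but data survives (simplified rollback).
                rollback_found = True
                replicas = live
            new_contracts.append(list(replicas))
        return new_contracts, rollback_found, erase_found

    # Shard 1 lost both replicas, shard 2 lost one of two.
    print(calculate_emergency_repair(
        [["a", "b"], ["b", "c"]], {"a", "b"}, "unsafe_rollback_or_erase", "d"))
    # -> ([['d'], ['c']], True, True)
    print(calculate_emergency_repair(
        [["a", "b"], ["b", "c"]], {"a", "b"}, "_debug_recommit", "d"))
    # -> ([['a', 'b'], ['b', 'c']], False, False)
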
2 changes: 1 addition & 1 deletion src/clustering/table_contract/emergency_repair.hpp
@@ -13,7 +13,7 @@ fix it unless `allow_erase` is `true`. */
void calculate_emergency_repair(
const table_raft_state_t &old_state,
const std::set<server_id_t> &dead_servers,
-bool allow_erase,
+emergency_repair_mode_t mode,
table_raft_state_t *new_state_out,
bool *rollback_found_out,
bool *erase_found_out);
7 changes: 4 additions & 3 deletions src/clustering/table_manager/table_meta_client.cc
@@ -439,7 +439,7 @@ void table_meta_client_t::set_config(

void table_meta_client_t::emergency_repair(
const namespace_id_t &table_id,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor_on_caller,
table_config_and_shards_t *new_config_out,
@@ -480,14 +480,15 @@ void table_meta_client_t::emergency_repair(
calculate_emergency_repair(
old_state,
dead_servers,
-allow_erase,
+mode,
&new_state,
rollback_found_out,
erase_found_out);

*new_config_out = new_state.config;

-if ((*rollback_found_out || *erase_found_out) && !dry_run) {
+if ((*rollback_found_out || *erase_found_out ||
+mode == emergency_repair_mode_t::DEBUG_RECOMMIT) && !dry_run) {
/* In theory, we don't always have to start a new epoch. Sometimes we run an
emergency repair where we've lost a quorum of one shard, but still have a quorum
of the Raft cluster as a whole. In that case we could run a regular Raft
2 changes: 1 addition & 1 deletion src/clustering/table_manager/table_meta_client.hpp
@@ -191,7 +191,7 @@ class table_meta_client_t :
detected. */
void emergency_repair(
const namespace_id_t &table_id,
-bool allow_erase,
+emergency_repair_mode_t mode,
bool dry_run,
signal_t *interruptor,
table_config_and_shards_t *new_config_out,
4 changes: 4 additions & 0 deletions src/protocol_api.hpp
@@ -133,6 +133,10 @@ ARCHIVE_PRIM_MAKE_RANGED_SERIALIZABLE(
reql_version_t, int8_t,
reql_version_t::EARLIEST, reql_version_t::LATEST);

+enum class emergency_repair_mode_t { DEBUG_RECOMMIT,
+UNSAFE_ROLLBACK,
+UNSAFE_ROLLBACK_OR_ERASE };
+
/* `backfill_item_memory_tracker_t` is used by the backfilling logic to control the
memory usage on the backfill sender. It is updated whenever a key/value pair is
loaded, or a new backfill_item_t structure is allocated. */
2 changes: 1 addition & 1 deletion src/rdb_protocol/context.hpp
@@ -285,7 +285,7 @@ class reql_cluster_interface_t {
virtual bool table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_data_loss,
+emergency_repair_mode_t,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out,
12 changes: 7 additions & 5 deletions src/rdb_protocol/terms/db_table.cc
@@ -553,11 +553,13 @@ class reconfigure_term_t : public table_or_db_meta_term_t {

/* Parse `emergency_repair` to figure out which kind we're doing. */
datum_string_t emergency_repair_str = emergency_repair->as_str();
-bool allow_erase;
-if (emergency_repair_str == "unsafe_rollback") {
-allow_erase = false;
+emergency_repair_mode_t mode;
+if (emergency_repair_str == "_debug_recommit") {
+mode = emergency_repair_mode_t::DEBUG_RECOMMIT;
+} else if (emergency_repair_str == "unsafe_rollback") {
+mode = emergency_repair_mode_t::UNSAFE_ROLLBACK;
} else if (emergency_repair_str == "unsafe_rollback_or_erase") {
-allow_erase = true;
+mode = emergency_repair_mode_t::UNSAFE_ROLLBACK_OR_ERASE;
} else {
rfail_target(emergency_repair.get(), base_exc_t::LOGIC,
"`emergency_repair` should be \"unsafe_rollback\" or "
@@ -583,7 +585,7 @@ class reconfigure_term_t : public table_or_db_meta_term_t {
datum_t result;
admin_err_t error;
bool success = env->env->reql_cluster_interface()->table_emergency_repair(
-db, *name_if_table, allow_erase, dry_run,
+db, *name_if_table, mode, dry_run,
env->env->interruptor, &result, &error);
if (!success) {
REQL_RETHROW(error);
2 changes: 1 addition & 1 deletion src/unittest/rdb_env.cc
@@ -554,7 +554,7 @@ bool test_rdb_env_t::instance_t::db_reconfigure(
bool test_rdb_env_t::instance_t::table_emergency_repair(
UNUSED counted_t<const ql::db_t> db,
UNUSED const name_string_t &name,
-UNUSED bool allow_erase,
+UNUSED emergency_repair_mode_t mode,
UNUSED bool dry_run,
UNUSED signal_t *local_interruptor,
UNUSED ql::datum_t *result_out,
2 changes: 1 addition & 1 deletion src/unittest/rdb_env.hpp
@@ -233,7 +233,7 @@ class test_rdb_env_t {
bool table_emergency_repair(
counted_t<const ql::db_t> db,
const name_string_t &name,
-bool allow_erase,
+emergency_repair_mode_t,
bool dry_run,
signal_t *interruptor,
ql::datum_t *result_out,
6 changes: 6 additions & 0 deletions test/interface/emergency_repair.py
@@ -47,6 +47,8 @@ def make_table(name, shards):
res = r.table(name).insert({"number": i} for i in xrange(docs_per_table)).run(conn)
assert res.get("inserted") == 100

+make_table("recommit", [
+{"primary_replica": "a", "replicas": ["a"]}])
make_table("no_repair_1", [
{"primary_replica": "a", "replicas": ["a"]}])
make_table("no_repair_2", [
@@ -155,6 +157,9 @@ def check_table_half(name, wait_for="ready_for_writes"):
assert 0.25*docs_per_table < count < 0.75*docs_per_table, \
("Found %d rows, expected about %d" % (count, 0.50*docs_per_table))

+repair("recommit", "_debug_recommit",
+[{"primary_replica": "a", "replicas": ["a"]}])
+
# `no_repair_1` is hosted only on server "a"
check_table("no_repair_1")
bad_repair("no_repair_1", "unsafe_rollback",
@@ -211,6 +216,7 @@ def check_table_half(name, wait_for="ready_for_writes"):
new_x.wait_until_started_up()

# Make sure that the reappearance of the dead server doesn't break anything
+check_table("recommit", wait_for="all_replicas_ready")
check_table("no_repair_1", wait_for="all_replicas_ready")
check_table("no_repair_2", wait_for="all_replicas_ready")
check_table("rollback", wait_for="all_replicas_ready")
