Skip to content

Commit

Permalink
replication: fix extraneous split-brain alerting
Browse files Browse the repository at this point in the history
Current split-brain detector implementation raises an error each time a
CONFIRM or ROLLBACK entry is received from the previous synchronous
transaction queue owner. It is assumed that the new queue owner must
have witnessed all the previous CONFIRMS. Besides, according to Raft,
ROLLBACK should never happen.

Actually there is a case when a CONFIRM from an old term is legal: it's
possible that during leader transition old leader writes a CONFIRM for
the same transaction that is confirmed by the new leader's PROMOTE. If
PROMOTE and CONFIRM lsns match there is nothing bad about such
situation.

Symmetrically, when an old leader issues a ROLLBACK with the lsn right
after the new leader's PROMOTE lsn, it is not a split-brain.

Allow such cases by tracking the last confirmed lsn for each synchronous
transaction queue owner and silently nopifying CONFIRMs with an lsn less
than the one recorded and ROLLBACKs with lsn greater than that.

Closes tarantool#9138

NO_DOC=bugfix
  • Loading branch information
sergepetrenko committed Nov 10, 2023
1 parent fa4939b commit 95327e9
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 17 deletions.
6 changes: 6 additions & 0 deletions changelogs/unreleased/gh-9138-relax-split-brain-check.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## bugfix/replication

* Fixed a false-positive split-brain error when an old synchronous transaction
queue owner confirmed the same transactions which were already confirmed by
the new queue owner, or rolled back the same transactions which were rolled
back by the new queue owner (gh-9138).
34 changes: 26 additions & 8 deletions src/box/applier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1516,14 +1516,14 @@ applier_synchro_filter_tx(struct stailq *rows)
* It may happen that we receive the instance's rows via some third
* node, so cannot check for applier->instance_id here.
*/
row = &stailq_first_entry(rows, struct applier_tx_row, next)->row;
row = &stailq_last_entry(rows, struct applier_tx_row, next)->row;
uint64_t term = txn_limbo_replica_term(&txn_limbo, row->replica_id);
assert(term <= txn_limbo.promote_greatest_term);
if (term == txn_limbo.promote_greatest_term)
return;

/*
* We do not nopify promotion/demotion and confirm/rollback.
* We do not nopify promotion/demotion and most of confirm/rollback.
* Such syncrhonous requests should be filtered by txn_limbo to detect
* possible split brain situations.
*
Expand All @@ -1534,14 +1534,10 @@ applier_synchro_filter_tx(struct stailq *rows)
* claimed by someone is a marker of split-brain by itself: consider it
* a synchronous transaction, which is committed with quorum 1.
*/
struct xrow_header *last_row =
&stailq_last_entry(rows, struct applier_tx_row, next)->row;
struct applier_tx_row *item;
if (!last_row->wait_sync) {
if (!iproto_type_is_dml(last_row->type) ||
txn_limbo.owner_id == REPLICA_ID_NIL) {
if (iproto_type_is_dml(row->type) && !row->wait_sync) {
if (txn_limbo.owner_id == REPLICA_ID_NIL)
return;
}
stailq_foreach_entry(item, rows, next) {
row = &item->row;
if (row->type == IPROTO_NOP)
Expand All @@ -1550,6 +1546,28 @@ applier_synchro_filter_tx(struct stailq *rows)
"got an async transaction from an old term");
}
return;
} else if (iproto_type_is_synchro_request(row->type)) {
item = stailq_last_entry(rows, typeof(*item), next);
struct synchro_request req = item->req.synchro;
/* Note! Might be different from row->replica_id. */
uint32_t owner_id = req.replica_id;
int64_t confirmed_lsn =
txn_limbo_replica_confirmed_lsn(&txn_limbo, owner_id);
switch (row->type) {
case IPROTO_RAFT_PROMOTE:
case IPROTO_RAFT_DEMOTE:
return;
case IPROTO_RAFT_CONFIRM:
if (req.lsn > confirmed_lsn)
return;
break;
case IPROTO_RAFT_ROLLBACK:
if (req.lsn <= confirmed_lsn)
return;
break;
default:
unreachable();
}
}
stailq_foreach_entry(item, rows, next) {
row = &item->row;
Expand Down
16 changes: 13 additions & 3 deletions src/box/txn_limbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ txn_limbo_create(struct txn_limbo *limbo)
fiber_cond_create(&limbo->wait_cond);
vclock_create(&limbo->vclock);
vclock_create(&limbo->promote_term_map);
vclock_create(&limbo->confirmed_vclock);
limbo->promote_greatest_term = 0;
latch_create(&limbo->promote_latch);
limbo->confirmed_lsn = 0;
Expand Down Expand Up @@ -431,6 +432,7 @@ txn_limbo_write_confirm(struct txn_limbo *limbo, int64_t lsn)
assert(lsn > limbo->confirmed_lsn);
assert(!limbo->is_in_rollback);
limbo->confirmed_lsn = lsn;
vclock_follow(&limbo->confirmed_vclock, limbo->owner_id, lsn);
txn_limbo_write_synchro(limbo, IPROTO_RAFT_CONFIRM, lsn, 0);
}

Expand Down Expand Up @@ -503,8 +505,10 @@ txn_limbo_read_confirm(struct txn_limbo *limbo, int64_t lsn)
* comparing existing confirm_lsn with the one arriving from a remote
* instance.
*/
if (limbo->confirmed_lsn < lsn)
if (limbo->confirmed_lsn < lsn) {
limbo->confirmed_lsn = lsn;
vclock_follow(&limbo->confirmed_vclock, limbo->owner_id, lsn);
}
}

/**
Expand Down Expand Up @@ -597,8 +601,10 @@ txn_limbo_read_promote(struct txn_limbo *limbo, uint32_t replica_id,
* Only nullify confirmed_lsn when the new value is unknown. I.e. when
* prev_id != replica_id.
*/
if (replica_id != prev_id)
limbo->confirmed_lsn = 0;
if (replica_id != prev_id) {
limbo->confirmed_lsn = vclock_get(&limbo->confirmed_vclock,
replica_id);
}
}

int
Expand Down Expand Up @@ -1108,6 +1114,8 @@ txn_limbo_req_prepare(struct txn_limbo *limbo,
assert(limbo->svp_confirmed_lsn == -1);
limbo->svp_confirmed_lsn = limbo->confirmed_lsn;
limbo->confirmed_lsn = req->lsn;
vclock_reset(&limbo->confirmed_vclock, limbo->owner_id,
req->lsn);
break;
}
/*
Expand All @@ -1129,6 +1137,8 @@ txn_limbo_req_rollback(struct txn_limbo *limbo,
assert(limbo->is_in_rollback);
assert(limbo->svp_confirmed_lsn >= 0);
limbo->confirmed_lsn = limbo->svp_confirmed_lsn;
vclock_reset(&limbo->confirmed_vclock, limbo->owner_id,
limbo->svp_confirmed_lsn);
limbo->svp_confirmed_lsn = -1;
limbo->is_in_rollback = false;
break;
Expand Down
15 changes: 15 additions & 0 deletions src/box/txn_limbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ struct txn_limbo {
* except outdated nodes.
*/
struct vclock promote_term_map;
/**
* A vclock containing biggest known confirmed lsns for each previous
* limbo owner.
*/
struct vclock confirmed_vclock;
/**
* The biggest PROMOTE term seen by the instance and persisted in WAL.
* It is related to raft term, but not the same. Synchronous replication
Expand Down Expand Up @@ -267,6 +272,16 @@ txn_limbo_replica_term(const struct txn_limbo *limbo, uint32_t replica_id)
return vclock_get(&limbo->promote_term_map, replica_id);
}

/**
* Return the latest confirmed lsn for the replica with id @replica_id.
*/
static inline int64_t
txn_limbo_replica_confirmed_lsn(const struct txn_limbo *limbo,
uint32_t replica_id)
{
return vclock_get(&limbo->confirmed_vclock, replica_id);
}

/**
* Return the last synchronous transaction in the limbo or NULL when it is
* empty.
Expand Down
75 changes: 69 additions & 6 deletions test/replication-luatest/gh_5295_split_brain_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ g.before_all(function(cg)
replication_synchro_quorum = 1,
replication_synchro_timeout = 0.01,
election_timeout = 0.1,
election_fencing_enabled = false,
election_fencing_mode = 'off',
log_level = 6,
}

Expand Down Expand Up @@ -112,6 +112,15 @@ local function reconnect_and_check_split_brain(cg)
end)
end

local function reconnect_and_check_no_split_brain(cg)
local srv = cg.split_replica
srv:exec(update_replication, {cg.main.net_box_uri})
t.helpers.retrying({}, srv.exec, srv, function()
local upstream = box.info.replication[1].upstream
t.assert_equals(upstream.status, 'follow', 'no split-brain')
end)
end

local function write_promote()
t.assert_not_equals(box.info.synchro.queue.owner, box.info.id,
"Promoting a follower")
Expand Down Expand Up @@ -140,26 +149,80 @@ g.test_async_old_term = function(cg)
reconnect_and_check_split_brain(cg)
end

-- Any unseen sync transaction confirmation from an obsolete term means a
-- A conflicting sync transaction confirmation from an obsolete term means a
-- split-brain.
g.test_confirm_old_term = function(cg)
g.test_bad_confirm_old_term = function(cg)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function() box.space.sync:replace{1} end)
reconnect_and_check_split_brain(cg)
end

-- Any unseen sync transaction rollback from an obsolete term means a
-- Obsolete sync transaction confirmation might be fine when it doesn't
-- contradict local history.
g.test_good_confirm_old_term = function(cg)
t.tarantool.skip_if_not_debug()
-- Delay confirmation on the old leader, so that the transaction is included
-- into new leader's PROMOTE, but the old leader writes a CONFIRM for it.
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
require('fiber').new(function() box.space.sync:replace{1} end)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY', false)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
reconnect_and_check_no_split_brain(cg)
end

-- A conflicting sync transaction rollback from an obsolete term means a
-- split-brain.
g.test_rollback_old_term = function(cg)
g.test_bad_rollback_old_term = function(cg)
t.tarantool.skip_if_not_debug()
-- Delay rollback on the old leader, so that the transaction is included
-- into new leader's PROMOTE, but the old leader writes a ROLLBACK for it.
cg.main:exec(function()
local lsn = box.info.lsn
box.cfg{replication_synchro_quorum = 31}
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
require('fiber').new(function() box.space.sync:replace{1} end)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY', false)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
box.cfg{replication_synchro_quorum = 1}
end)
reconnect_and_check_split_brain(cg)
end

-- Obsolete sync transaction rollback might be fine when it doesn't contradict
-- local history.
g.test_good_rollback_old_term = function(cg)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
box.cfg{replication_synchro_quorum = 31}
pcall(box.space.sync.replace, box.space.sync, {1})
box.cfg{replication_synchro_quorum = 1}
end)
reconnect_and_check_split_brain(cg)
reconnect_and_check_no_split_brain(cg)
end

-- Conflicting demote for the same term is a split-brain.
Expand Down

0 comments on commit 95327e9

Please sign in to comment.