Skip to content

Commit

Permalink
box: disable split-brain detection until schema is upgraded
Browse files Browse the repository at this point in the history
Our split-brain detection machinery relies, among other things, on all
nodes tracking the synchro queue's confirmed lsn. This tracking was only
added together with split-brain detection; before that, only the synchro
queue owner tracked the confirmed lsn.

This means that after an upgrade all the replicas remember the latest
confirmed lsn as 0, and any PROMOTE/DEMOTE request from the queue owner
is treated as a split brain.

Let's fix this and only enable split-brain detection on the replica set
once the schema version is updated. Thanks to the synchro queue freeze
on restart, this can only happen after a new PROMOTE or DEMOTE entry is
written by one of the nodes, and thus the correct confirmed lsn
is propagated with this PROMOTE/DEMOTE to all the cluster members.

Closes tarantool#8996

NO_DOC=bugfix

(cherry picked from commit a844bd3)
  • Loading branch information
sergepetrenko committed Sep 28, 2023
1 parent 2757a84 commit 6b8a39f
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 2 deletions.
4 changes: 4 additions & 0 deletions changelogs/unreleased/gh-8996-spurious-spit-brain-detected.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## bugfix/replication

* Fixed a false-positive split-brain error in a replica set on the first
promotion after an upgrade from versions before 2.10.1 (gh-8996).
34 changes: 34 additions & 0 deletions src/box/alter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "coll_id_cache.h"
#include "coll_id_def.h"
#include "txn.h"
#include "txn_limbo.h"
#include "tuple.h"
#include "tuple_constraint.h"
#include "fiber.h" /* for gc_pool */
Expand Down Expand Up @@ -4010,6 +4011,20 @@ on_replace_dd_priv(struct trigger * /* trigger */, void *event)

/* {{{ cluster configuration */

/**
 * Fiber entry point: turn on filtering of incoming synchro requests
 * for the global limbo. The variadic argument list is unused.
 * NOTE(review): presumably this runs in its own fiber because
 * txn_limbo_filter_enable() takes the promote latch - confirm.
 *
 * @retval 0 always.
 */
static int
start_synchro_filtering(va_list /* ap */)
{
txn_limbo_filter_enable(&txn_limbo);
return 0;
}

/**
 * Fiber entry point: turn off filtering of incoming synchro requests
 * for the global limbo. The variadic argument list is unused.
 * NOTE(review): presumably this runs in its own fiber because
 * txn_limbo_filter_disable() takes the promote latch - confirm.
 *
 * @retval 0 always.
 */
static int
stop_synchro_filtering(va_list /* ap */)
{
txn_limbo_filter_disable(&txn_limbo);
return 0;
}

/**
* This trigger is invoked only upon initial recovery, when
* reading contents of the system spaces from the snapshot.
Expand Down Expand Up @@ -4061,6 +4076,25 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event)
*/
dd_version_id = tarantool_version_id();
}
if (recovery_state != FINISHED_RECOVERY) {
return 0;
}
struct fiber *fiber = NULL;
if (dd_version_id > version_id(2, 10, 1) &&
recovery_state == FINISHED_RECOVERY) {
fiber = fiber_new_system("synchro_filter_enabler",
start_synchro_filtering);
if (fiber == NULL)
return -1;
fiber_wakeup(fiber);
} else if (dd_version_id <= version_id(2, 10, 1) &&
recovery_state == FINISHED_RECOVERY) {
fiber = fiber_new_system("synchro_filter_disabler",
stop_synchro_filtering);
if (fiber == NULL)
return -1;
fiber_wakeup(fiber);
}
}
return 0;
}
Expand Down
7 changes: 5 additions & 2 deletions src/box/box.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4974,9 +4974,12 @@ box_cfg_xc(void)
/*
* Enable split brain detection once node is fully recovered or
* bootstrapped. No split brain could happen during bootstrap or local
* recovery.
* recovery. Only do so in an upgraded cluster. Unfortunately, schema
* version 2.10.1 was used in 2.10.0 release, while split-brain
* detection appeared in 2.10.1. So use the schema version after 2.10.1.
*/
txn_limbo_filter_enable(&txn_limbo);
if (dd_version_id > version_id(2, 10, 1))
txn_limbo_filter_enable(&txn_limbo);

title("running");
say_info("ready to accept requests");
Expand Down
8 changes: 8 additions & 0 deletions src/box/txn_limbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,14 @@ txn_limbo_filter_enable(struct txn_limbo *limbo)
latch_unlock(&limbo->promote_latch);
}

/*
 * Stop validating (filtering) incoming synchro requests: clear the
 * limbo's do_validate flag while holding promote_latch.
 */
void
txn_limbo_filter_disable(struct txn_limbo *limbo)
{
latch_lock(&limbo->promote_latch);
limbo->do_validate = false;
latch_unlock(&limbo->promote_latch);
}

void
txn_limbo_init(void)
{
Expand Down
4 changes: 4 additions & 0 deletions src/box/txn_limbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,10 @@ txn_limbo_on_parameters_change(struct txn_limbo *limbo);
void
txn_limbo_filter_enable(struct txn_limbo *limbo);

/** Stop filtering incoming synchro requests. */
void
txn_limbo_filter_disable(struct txn_limbo *limbo);

/**
* Freeze limbo. Prevent CONFIRMs and ROLLBACKs until limbo is unfrozen.
*/
Expand Down
78 changes: 78 additions & 0 deletions test/replication-luatest/gh_8996_synchro_filter_enable_test.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
local t = require('luatest')
local replica_set = require('luatest.replica_set')
local server = require('luatest.server')

local g = t.group('synchro-filter-enable-by-version')

-- Prepare a two-node replica set (server1, server2) before every test.
-- Both nodes share one box.cfg in which each lists both listen URIs as
-- replication sources, with a short replication timeout.
g.before_each(function(cg)
    local rs = replica_set:new{}
    cg.replica_set = rs

    local aliases = {'server1', 'server2'}
    local uris = {}
    for idx, alias in ipairs(aliases) do
        uris[idx] = server.build_listen_uri(alias, rs.id)
    end

    cg.box_cfg = {
        replication = uris,
        replication_timeout = 0.1,
    }

    -- Servers are stored on the context under their alias.
    for _, alias in ipairs(aliases) do
        cg[alias] = rs:build_and_add_server{
            alias = alias,
            box_cfg = cg.box_cfg,
        }
    end
end)

-- Drop the replica set built for the test that has just finished.
g.after_each(function(cg)
    local rs = cg.replica_set
    rs:drop()
end)

-- Verify that split-brain detection is inactive while the schema version is
-- <= 2.10.1 and becomes active again after a schema upgrade.
g.test_filter_enable_disable = function(cg)
    local nodes = {cg.server1, cg.server2}

    -- Cut replication on both nodes so that they can diverge.
    local function disconnect_all()
        for _, node in ipairs(nodes) do
            node:update_box_cfg({replication = ""})
        end
    end

    -- Phase 1: downgrade the schema and let the second node catch up.
    cg.replica_set:start()
    nodes[1]:exec(function()
        box.ctl.wait_rw()
        box.schema.downgrade('2.10.1')
        t.assert_equals(box.space._schema:get{'version'},
                        {'version', 2, 10, 1})
    end)
    nodes[2]:wait_for_vclock_of(nodes[1])

    -- Phase 2: promote both nodes while they are isolated. With the old
    -- schema the nodes must still follow each other after reconnecting.
    disconnect_all()
    for _, node in ipairs(nodes) do
        node:exec(function()
            box.ctl.promote()
        end)
    end

    for _, node in ipairs(nodes) do
        node:update_box_cfg(cg.box_cfg)
    end
    nodes[1]:wait_for_vclock_of(nodes[2])
    nodes[2]:wait_for_vclock_of(nodes[1])
    nodes[1]:assert_follows_upstream(nodes[2]:get_instance_id())
    nodes[2]:assert_follows_upstream(nodes[1]:get_instance_id())

    -- Phase 3: repeat the isolated promotion, this time upgrading the
    -- schema on each node as well.
    disconnect_all()
    for _, node in ipairs(nodes) do
        node:exec(function()
            box.ctl.promote()
            box.schema.upgrade()
        end)
    end

    -- After reconnecting, each node must report a stopped upstream with a
    -- split-brain message for its peer.
    t.helpers.retrying({}, function()
        for idx, node in ipairs(nodes) do
            local peer = nodes[3 - idx]
            node:update_box_cfg(cg.box_cfg)
            node:exec(function(id)
                t.assert_equals(box.info.replication[id].upstream.status,
                                'stopped')
                t.assert_str_contains(box.info.replication[id].upstream.message,
                                      'Split-Brain discovered')
            end, {peer:get_instance_id()})
        end
    end)
end

0 comments on commit 6b8a39f

Please sign in to comment.