Skip to content

Commit

Permalink
raft: persist vote for self together with term bump
Browse files Browse the repository at this point in the history
Commit c9155ac ("raft: persist new term and vote separately") made
the nodes persist new term and vote separately, using 2 WAL writes.
Writing the term first is needed to flush all the ongoing transactions,
so that the node's vclock is updated and can be checked against the
candidate's vclock. Otherwise it could happen that the node persists a
vote for some candidate only to find that it's vclock would actually
become incomparable with the candidate's.

Actually, this guard is not needed when checking a vote for self,
because a node can always vote for self. Besides, splitting term bump
and vote can lead to increased probability of split-vote. It may happen
that a candidate bumps and broadcasts the new term without a vote,
making other nodes vote for self. Let's go back to writing term and vote
together for self votes.

This change makes raft candidate persist term bump and vote for self in
one WAL write instead of two, so all the tests which count WAL writes or
expect 2 separate state updates for term and vote are rewritten.

Prerequisite tarantool#8497

NO_DOC=not user-visible
NO_CHANGELOG=not user-visible
  • Loading branch information
sergepetrenko committed May 29, 2023
1 parent 3b244fc commit 395346e
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 119 deletions.
18 changes: 9 additions & 9 deletions src/lib/raft/raft.c
Expand Up @@ -740,6 +740,15 @@ raft_worker_handle_io(struct raft *raft)
assert(raft->volatile_term >= raft->term);
if (raft->volatile_vote == 0)
goto do_dump;
/*
* Skip self. When vote is issued, own vclock can be smaller,
* but that doesn't matter. Can always vote for self. Not having
* this special case still works if the node is configured as a
* candidate, but the node might log that it canceled a vote for
* self, which is confusing.
*/
if (raft->volatile_vote == raft->self)
goto do_dump_with_vote;
/*
* Vote and term bumps are persisted separately. This serves as
* a flush of all transactions going to WAL right now so as the
Expand All @@ -750,15 +759,6 @@ raft_worker_handle_io(struct raft *raft)
*/
if (raft->volatile_term > raft->term)
goto do_dump;
/*
* Skip self. When vote was issued, own vclock could be smaller,
* but that doesn't matter. Can always vote for self. Not having
* this special case still works if the node is configured as a
* candidate, but the node might log that it canceled a vote for
* self, which is confusing.
*/
if (raft->volatile_vote == raft->self)
goto do_dump_with_vote;
if (!raft_can_vote_for(raft, &raft->candidate_vclock)) {
say_info("RAFT: vote request for %u is canceled - the "
"vclock is not acceptable anymore",
Expand Down
37 changes: 16 additions & 21 deletions test/replication/election_basic.result
Expand Up @@ -338,68 +338,63 @@ box.cfg{election_mode='candidate'}
| ---
| ...

test_run:wait_cond(function() return #election_tbl == 4 end)
test_run:wait_cond(function() return #election_tbl == 3 end)
| ---
| - true
| ...
assert(election_tbl[1].state == 'follower')
| ---
| - true
| ...
assert(election_tbl[2].state == 'follower')
| ---
| - true
| ...
assert(election_tbl[2].term > election_tbl[1].term)
| ---
| - true
| ...
-- Vote is visible here already, but it is volatile.
assert(election_tbl[2].vote == 1)
| ---
| - true
| ...
assert(election_tbl[3].state == 'candidate')
assert(election_tbl[2].state == 'candidate')
| ---
| - true
| ...
assert(election_tbl[3].vote == 1)
assert(election_tbl[2].vote == 1)
| ---
| - true
| ...
assert(election_tbl[4].state == 'leader')
assert(election_tbl[3].state == 'leader')
| ---
| - true
| ...

box.cfg{election_mode='voter'}
| ---
| ...
test_run:wait_cond(function() return #election_tbl == 5 end)
test_run:wait_cond(function() return #election_tbl == 4 end)
| ---
| - true
| ...
assert(election_tbl[5].state == 'follower')
assert(election_tbl[4].state == 'follower')
| ---
| - true
| ...

box.cfg{election_mode='off'}
| ---
| ...
test_run:wait_cond(function() return #election_tbl == 6 end)
test_run:wait_cond(function() return #election_tbl == 5 end)
| ---
| - true
| ...

box.cfg{election_mode='manual'}
| ---
| ...
test_run:wait_cond(function() return #election_tbl == 7 end)
test_run:wait_cond(function() return #election_tbl == 6 end)
| ---
| - true
| ...
assert(election_tbl[7].state == 'follower')
assert(election_tbl[6].state == 'follower')
| ---
| - true
| ...
Expand All @@ -408,32 +403,32 @@ box.ctl.promote()
| ---
| ...

test_run:wait_cond(function() return #election_tbl == 10 end)
test_run:wait_cond(function() return #election_tbl == 9 end)
| ---
| - true
| ...
assert(election_tbl[8].state == 'follower')
assert(election_tbl[7].state == 'follower')
| ---
| - true
| ...
assert(election_tbl[8].term == election_tbl[7].term + 1)
assert(election_tbl[7].term == election_tbl[6].term + 1)
| ---
| - true
| ...
-- Vote is visible here already, but it is volatile.
assert(election_tbl[8].vote == 1)
assert(election_tbl[7].vote == 1)
| ---
| - true
| ...
assert(election_tbl[9].state == 'candidate')
assert(election_tbl[8].state == 'candidate')
| ---
| - true
| ...
assert(election_tbl[9].vote == 1)
assert(election_tbl[8].vote == 1)
| ---
| - true
| ...
assert(election_tbl[10].state == 'leader')
assert(election_tbl[9].state == 'leader')
| ---
| - true
| ...
Expand Down
32 changes: 15 additions & 17 deletions test/replication/election_basic.test.lua
Expand Up @@ -144,37 +144,35 @@ _ = box.ctl.on_election(trig)
box.cfg{replication_synchro_quorum=2}
box.cfg{election_mode='candidate'}

test_run:wait_cond(function() return #election_tbl == 4 end)
test_run:wait_cond(function() return #election_tbl == 3 end)
assert(election_tbl[1].state == 'follower')
assert(election_tbl[2].state == 'follower')
assert(election_tbl[2].term > election_tbl[1].term)
-- Vote is visible here already, but it is volatile.
assert(election_tbl[2].vote == 1)
assert(election_tbl[3].state == 'candidate')
assert(election_tbl[3].vote == 1)
assert(election_tbl[4].state == 'leader')
assert(election_tbl[2].state == 'candidate')
assert(election_tbl[2].vote == 1)
assert(election_tbl[3].state == 'leader')

box.cfg{election_mode='voter'}
test_run:wait_cond(function() return #election_tbl == 5 end)
assert(election_tbl[5].state == 'follower')
test_run:wait_cond(function() return #election_tbl == 4 end)
assert(election_tbl[4].state == 'follower')

box.cfg{election_mode='off'}
test_run:wait_cond(function() return #election_tbl == 6 end)
test_run:wait_cond(function() return #election_tbl == 5 end)

box.cfg{election_mode='manual'}
test_run:wait_cond(function() return #election_tbl == 7 end)
assert(election_tbl[7].state == 'follower')
test_run:wait_cond(function() return #election_tbl == 6 end)
assert(election_tbl[6].state == 'follower')

box.ctl.promote()

test_run:wait_cond(function() return #election_tbl == 10 end)
assert(election_tbl[8].state == 'follower')
assert(election_tbl[8].term == election_tbl[7].term + 1)
test_run:wait_cond(function() return #election_tbl == 9 end)
assert(election_tbl[7].state == 'follower')
assert(election_tbl[7].term == election_tbl[6].term + 1)
-- Vote is visible here already, but it is volatile.
assert(election_tbl[7].vote == 1)
assert(election_tbl[8].state == 'candidate')
assert(election_tbl[8].vote == 1)
assert(election_tbl[9].state == 'candidate')
assert(election_tbl[9].vote == 1)
assert(election_tbl[10].state == 'leader')
assert(election_tbl[9].state == 'leader')

test_run:cmd('stop server replica')
test_run:cmd('delete server replica')
Expand Down

0 comments on commit 395346e

Please sign in to comment.