From ba5279577726fa20e8c46262a5dcf6752b9f8658 Mon Sep 17 00:00:00 2001 From: Serge Petrenko Date: Fri, 19 May 2023 15:40:39 +0300 Subject: [PATCH] raft: persist vote for self together with term bump Commit c9155ac86362 ("raft: persist new term and vote separately") made the nodes persist new term and vote separately, using 2 WAL writes. Writing the term first is needed to flush all the ongoing transactions, so that the node's vclock is updated and can be checked against the candidate's vclock. Otherwise it could happen that the node persists a vote for some candidate only to find that it's vclock would actually become incomparable with the candidate's. Actually, this guard is not needed when checking a vote for self, because a node can always vote for self. Let's go back to writing term and vote together for self votes. Prerequisite #8497 NO_DOC=not user-visible NO_CHANGELOG=not user-visible NO_TEST=covered by the next commit --- src/lib/raft/raft.c | 18 +++++++++--------- test/unit/raft.c | 46 ++++++++++++++++----------------------------- 2 files changed, 25 insertions(+), 39 deletions(-) diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c index dd28a182c8d3..72d914971b01 100644 --- a/src/lib/raft/raft.c +++ b/src/lib/raft/raft.c @@ -740,6 +740,15 @@ raft_worker_handle_io(struct raft *raft) assert(raft->volatile_term >= raft->term); if (raft->volatile_vote == 0) goto do_dump; + /* + * Skip self. When vote is issued, own vclock can be smaller, + * but that doesn't matter. Can always vote for self. Not having + * this special case still works if the node is configured as a + * candidate, but the node might log that it canceled a vote for + * self, which is confusing. + */ + if (raft->volatile_vote == raft->self) + goto do_dump_with_vote; /* * Vote and term bumps are persisted separately. This serves as * a flush of all transactions going to WAL right now so as the @@ -750,15 +759,6 @@ raft_worker_handle_io(struct raft *raft) */ if (raft->volatile_term > raft->term) goto do_dump; - /* - * Skip self. When vote was issued, own vclock could be smaller, - * but that doesn't matter. Can always vote for self. Not having - * this special case still works if the node is configured as a - * candidate, but the node might log that it canceled a vote for - * self, which is confusing. - */ - if (raft->volatile_vote == raft->self) - goto do_dump_with_vote; if (!raft_can_vote_for(raft, &raft->candidate_vclock)) { say_info("RAFT: vote request for %u is canceled - the " "vclock is not acceptable anymore", diff --git a/test/unit/raft.c b/test/unit/raft.c index 1055b876875f..d80df1efc65f 100644 --- a/test/unit/raft.c +++ b/test/unit/raft.c @@ -68,7 +68,7 @@ raft_test_leader_election(void) 1 /* Vote. */, 2 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 2}" /* Vclock. */ + "{0: 1}" /* Vclock. */ ), "elections with a new term"); is(raft_vote_count(&node.raft), 1, "single vote for self"); ok(node.update_count > 0, "trigger worked"); @@ -76,36 +76,22 @@ raft_test_leader_election(void) /* Check if all async work is done properly. */ - is(node.journal.size, 2, "2 records in the journal"); + is(node.journal.size, 1, "2 records in the journal"); ok(raft_node_journal_check_row( &node, 0 /* Index. */, 2 /* Term. */, - 0 /* Vote. */ - ), "term is on disk"); - ok(raft_node_journal_check_row( - &node, - 1 /* Index. */, - 2 /* Term. */, 1 /* Vote. */ - ), "vote is on disk"); + ), "term and vote is on disk"); - is(node.net.count, 2, "2 pending messages"); + is(node.net.count, 1, "1 pending message"); ok(raft_node_net_check_msg(&node, 0 /* Index. */, RAFT_STATE_FOLLOWER /* State. */, 2 /* Term. */, - 0 /* Vote. */, - NULL /* Vclock. */ - ), "term bump is sent"); - ok(raft_node_net_check_msg( - &node, - 1 /* Index. */, - RAFT_STATE_CANDIDATE /* State. */, - 2 /* Term. */, 1 /* Vote. */, - "{0: 2}" /* Vclock. */ - ), "vote request is sent"); + NULL /* Vclock. */ + ), "term bump and vote are sent"); raft_node_net_drop(&node); /* Simulate first response. Nothing should happen, quorum is 3. */ @@ -2165,7 +2151,7 @@ raft_test_pre_vote(void) 1 /* Vote. */, 3 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 3}" /* Vclock. */ + "{0: 2}" /* Vclock. */ ), "elections once no one sees the leader"); raft_node_cfg_election_quorum(&node, 1); @@ -2178,7 +2164,7 @@ raft_test_pre_vote(void) 1 /* Vote. */, 3 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 3}" /* Vclock. */ + "{0: 2}" /* Vclock. */ ), "become leader on quorum change"); raft_cfg_is_candidate_later(&node.raft, false); @@ -2191,7 +2177,7 @@ raft_test_pre_vote(void) 1 /* Vote. */, 3 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 3}" /* Vclock. */ + "{0: 2}" /* Vclock. */ ), "cfg_is_candidate_later doesn't disrupt leader"); is(raft_node_send_follower( @@ -2210,7 +2196,7 @@ raft_test_pre_vote(void) 0 /* Vote. */, 4 /* Volatile term. */, 0 /* Volatile vote. */, - "{0: 4}" /* Vclock. */ + "{0: 3}" /* Vclock. */ ), "term bump after cfg_is_candidate_later makes node a voter."); raft_cfg_is_candidate_later(&node.raft, true); @@ -2223,7 +2209,7 @@ raft_test_pre_vote(void) 0 /* Vote. */, 4 /* Volatile term. */, 0 /* Volatile vote. */, - "{0: 4}" /* Vclock. */ + "{0: 3}" /* Vclock. */ ), "cfg_is_candidate_later doesn't transfer voter to a candidate"); is(raft_node_send_follower( @@ -2242,7 +2228,7 @@ raft_test_pre_vote(void) 1 /* Vote. */, 5 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 6}" /* Vclock. */ + "{0: 5}" /* Vclock. */ ), "Term bump with cfg_is_candidate_later transfers voter to candiate"); is(raft_leader_idle(&node.raft), 0, @@ -2280,7 +2266,7 @@ raft_test_pre_vote(void) 0 /* Vote. */, 6 /* Volatile term. */, 0 /* Volatile vote. */, - "{0: 7}" /* Vclock. */ + "{0: 6}" /* Vclock. */ ), "no elections on start when someone sees the leader"); ok(!raft_ev_is_active(&node.raft.timer), @@ -2297,7 +2283,7 @@ raft_test_pre_vote(void) 0 /* Vote. */, 6 /* Volatile term. */, 0 /* Volatile vote. */, - "{0: 7}" /* Vclock. */ + "{0: 6}" /* Vclock. */ ), "no elections on becoming candidate when someone sees the leader"); ok(!raft_ev_is_active(&node.raft.timer), @@ -2362,7 +2348,7 @@ raft_test_resign(void) 1 /* Vote. */, 2 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 2}" /* Vclock. */ + "{0: 1}" /* Vclock. */ ), "became leader"); raft_node_resign(&node); @@ -2375,7 +2361,7 @@ raft_test_resign(void) 1 /* Vote. */, 2 /* Volatile term. */, 1 /* Volatile vote. */, - "{0: 2}" /* Vclock. */ + "{0: 1}" /* Vclock. */ ), "resigned from leader state"); raft_node_destroy(&node);