From 9954576fdbfd8407ab3dce30ff63959294bfeaf9 Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Wed, 5 Jul 2023 05:08:13 -0700
Subject: [PATCH] unsafe recovery: Fixing learner store being ignored error in
 7.1 (#6744)

ref tikv/pd#6690, ref tikv/pd#6710

Fix learner store/ replica being ignored error in auto detect mode.

Signed-off-by: Yang Zhang
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
---
 server/cluster/unsafe_recovery_controller.go |  3 --
 .../unsafe_recovery_controller_test.go       | 42 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/server/cluster/unsafe_recovery_controller.go b/server/cluster/unsafe_recovery_controller.go
index d54a331048e..6c2d5101928 100644
--- a/server/cluster/unsafe_recovery_controller.go
+++ b/server/cluster/unsafe_recovery_controller.go
@@ -706,9 +706,6 @@ func (u *unsafeRecoveryController) getFailedPeers(region *metapb.Region) []*meta
 
 	var failedPeers []*metapb.Peer
 	for _, peer := range region.Peers {
-		if peer.Role == metapb.PeerRole_Learner || peer.Role == metapb.PeerRole_DemotingVoter {
-			continue
-		}
 		if u.isFailed(peer) {
 			failedPeers = append(failedPeers, peer)
 		}
diff --git a/server/cluster/unsafe_recovery_controller_test.go b/server/cluster/unsafe_recovery_controller_test.go
index 6ef20fdb6b7..9a9eff293fc 100644
--- a/server/cluster/unsafe_recovery_controller_test.go
+++ b/server/cluster/unsafe_recovery_controller_test.go
@@ -554,6 +554,48 @@ func TestForceLeaderForCommitMerge(t *testing.T) {
 	re.Equal(demoteFailedVoter, recoveryController.GetStage())
 }
 
+// Failed learner replica/ store should be considered by auto-recover.
+func TestAutoDetectModeWithOneLearner(t *testing.T) {
+	re := require.New(t)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	_, opt, _ := newTestScheduleConfig()
+	cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster())
+	cluster.coordinator = newCoordinator(ctx, cluster, hbstream.NewTestHeartbeatStreams(ctx, cluster.meta.GetId(), cluster, true))
+	cluster.coordinator.run()
+	for _, store := range newTestStores(1, "6.0.0") {
+		re.NoError(cluster.PutStore(store.GetMeta()))
+	}
+	recoveryController := newUnsafeRecoveryController(cluster)
+	re.NoError(recoveryController.RemoveFailedStores(nil, 60, true))
+
+	storeReport := pdpb.StoreReport{
+		PeerReports: []*pdpb.PeerReport{
+			{
+				RaftState: &raft_serverpb.RaftLocalState{LastIndex: 10, HardState: &eraftpb.HardState{Term: 1, Commit: 10}},
+				RegionState: &raft_serverpb.RegionLocalState{
+					Region: &metapb.Region{
+						Id:          1001,
+						RegionEpoch: &metapb.RegionEpoch{ConfVer: 7, Version: 10},
+						Peers: []*metapb.Peer{
+							{Id: 11, StoreId: 1}, {Id: 12, StoreId: 2}, {Id: 13, StoreId: 3, Role: metapb.PeerRole_Learner}}}}},
+			},
+		},
+	}
+	req := newStoreHeartbeat(1, &storeReport)
+	req.StoreReport.Step = 1
+	resp := &pdpb.StoreHeartbeatResponse{}
+	recoveryController.HandleStoreHeartbeat(req, resp)
+	hasStore3AsFailedStore := false
+	for _, failedStore := range resp.RecoveryPlan.ForceLeader.FailedStores {
+		if failedStore == 3 {
+			hasStore3AsFailedStore = true
+			break
+		}
+	}
+	re.True(hasStore3AsFailedStore)
+}
+
 func TestAutoDetectMode(t *testing.T) {
 	re := require.New(t)
 	ctx, cancel := context.WithCancel(context.Background())
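
Note (reviewer sketch, not part of the patch): the first hunk removes the role filter from getFailedPeers, so a failed learner or demoting-voter peer is now reported like any failed voter and its store lands in the force-leader plan, which is what the new test asserts for store 3. The standalone Go program below is a simplified before/after model of that filter; the peer/peerRole types and the failedStores map are invented stand-ins for illustration, not PD's metapb types or the controller's isFailed check.

// Standalone sketch modelling why the old getFailedPeers filter hid failed
// learners in auto-detect mode. Simplified stand-ins, not the PD implementation.
package main

import "fmt"

type peerRole int

const (
	voter peerRole = iota
	learner
)

type peer struct {
	id      uint64
	storeID uint64
	role    peerRole
}

// failedStores stands in for the controller's isFailed check in auto-detect
// mode, where a peer counts as failed when its store never reported.
var failedStores = map[uint64]bool{3: true}

// getFailedPeersOld mirrors the pre-patch logic: learners are skipped, so a
// failed learner store never reaches the recovery plan.
func getFailedPeersOld(peers []peer) []peer {
	var failed []peer
	for _, p := range peers {
		if p.role == learner {
			continue // bug: failed learner is ignored
		}
		if failedStores[p.storeID] {
			failed = append(failed, p)
		}
	}
	return failed
}

// getFailedPeersNew mirrors the patched logic: every failed peer is reported,
// regardless of role.
func getFailedPeersNew(peers []peer) []peer {
	var failed []peer
	for _, p := range peers {
		if failedStores[p.storeID] {
			failed = append(failed, p)
		}
	}
	return failed
}

func main() {
	// Same shape as the test's region: voters on stores 1 and 2, learner on store 3.
	peers := []peer{{11, 1, voter}, {12, 2, voter}, {13, 3, learner}}
	fmt.Println("old:", getFailedPeersOld(peers)) // old: []        -- store 3 missed
	fmt.Println("new:", getFailedPeersNew(peers)) // new: [{13 3 1}] -- store 3 reported
}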