Skip to content

Commit

Permalink
RavenDB-5978 Failing Raft tests
Browse files Browse the repository at this point in the history
  • Loading branch information
talweiss1982 authored and ayende committed Dec 29, 2016
1 parent 573610a commit 5039333
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 32 deletions.
22 changes: 19 additions & 3 deletions Rachis/Rachis/RaftEngine.cs
Expand Up @@ -30,6 +30,7 @@ public class RaftEngine : IDisposable
private readonly RaftEngineOptions _raftEngineOptions;
private readonly CancellationTokenSource _eventLoopCancellationTokenSource;
private readonly ManualResetEventSlim _leaderSelectedEvent = new ManualResetEventSlim();
private readonly ManualResetEventSlim _leaderConfirmedEvent = new ManualResetEventSlim();
private TaskCompletionSource<object> _steppingDownCompletionSource;

private Topology _currentTopology;
Expand Down Expand Up @@ -59,7 +60,10 @@ public string CurrentLeader


if (value == null)
{
_leaderSelectedEvent.Reset();
_leaderConfirmedEvent.Reset();
}
else
{
_leaderSelectedEvent.Set();
Expand Down Expand Up @@ -121,8 +125,7 @@ public ProposingCandidacyResult CheckIfThereIsVetoOnBecomingCandidate()
public event Action StateTimeout;
public event Action<LogEntry[]> EntriesAppended;
public event Action<long, long> CommitIndexChanged;
public event Action ElectedAsLeader;

public event Action ElectedAsLeader;
public event Action<TopologyChangeCommand> TopologyChanged;
public event Action TopologyChanging;

Expand Down Expand Up @@ -514,6 +517,11 @@ public bool WaitForLeader(int timeout = 10*1000)
return _leaderSelectedEvent.Wait(timeout, CancellationToken);
}

/// <summary>
/// Blocks the calling thread until the leader-confirmed event is signaled
/// or the timeout elapses, whichever comes first.
/// </summary>
/// <param name="timeout">Maximum time to wait, in milliseconds. Defaults to ten seconds.</param>
/// <returns>
/// <c>true</c> if the event was signaled within <paramref name="timeout"/>; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// The wait also observes the engine's <c>CancellationToken</c>, so a cancellation
/// interrupts the wait instead of letting it run to the timeout.
/// </remarks>
public bool WaitForLeaderConfirmed(int timeout = 10 * 1000)
    => _leaderConfirmedEvent.Wait(timeout, CancellationToken);

public void AppendCommand(Command command)
{
if (command == null) throw new ArgumentNullException("command");
Expand Down Expand Up @@ -558,7 +566,15 @@ public void ApplyCommits(long from, long to)
// StartTopologyChange(tcc); - not sure why it was needed, see RavenDB-3808 for details
CommitTopologyChange(tcc);
}

var noop = command as NopCommand;
if (noop != null && entry.Term == PersistentState.CurrentTerm)
{
if (_log.IsInfoEnabled)
{
_log.Info($"Raising leaderConfirmedEvent, Term = {entry.Term}, Index = {entry.Index}, Name = {Name}");
}
_leaderConfirmedEvent.Set();
}
OnCommitIndexChanged(oldCommitIndex, CommitIndex);
OnCommitApplied(command);
}
Expand Down
11 changes: 1 addition & 10 deletions Raven.Database/Raft/ClusterManagementHttpClient.cs
Expand Up @@ -358,22 +358,13 @@ public async Task SendLeaveAsync(NodeConnectionInfo node)
if (raftEngine.Options.SelfConnection == node)
{
await raftEngine.StepDownAsync().ConfigureAwait(false);
raftEngine.WaitForLeader();
raftEngine.WaitForLeaderConfirmed();
}
else
{
// before we send the remove-from-cluster request, wait until the configuration has been propagated
// to avoid: Cannot modify the cluster topology when the committed index ___ is in term ___ but the current term is ___
// Wait until the leader finishes committing entries from the current term and try again
await Task.Delay(raftEngine.Options.HeartbeatTimeout * 2).ConfigureAwait(false);

// remove node from cluster by excluding it from topology
await raftEngine.RemoveFromClusterAsync(node).ConfigureAwait(false);

// since both remove from cluster and sendInitialize new cluster are sent from the current leader
// we have to wait for the first topology to be applied on the node being removed
await Task.Delay(raftEngine.Options.HeartbeatTimeout*2).ConfigureAwait(false);

// send information to the node that left so that it creates a new single-node topology
await SendInitializeNewClusterForAsync(node).ConfigureAwait(false);
return;
Expand Down
13 changes: 7 additions & 6 deletions Raven.Tests.Raft/ClusterBasic.cs
Expand Up @@ -136,13 +136,14 @@ public void CanCreateExtendAndRemoveFromCluster()
ExtendRaftCluster(3); // 5 nodes

ExtendRaftCluster(2); // 7 nodes

for (var i = 0; i < servers.Count; i++)
var removeIndexes = new List<int> {0,2,3,4,5,6};
var rand = new Random();
while (removeIndexes.Count>2)
{
if (i == 1) // already deleted
continue;

RemoveFromCluster(servers[i]);
var popIndex = rand.Next(removeIndexes.Count);
var popServer = servers[removeIndexes[popIndex]];
removeIndexes.RemoveAt(popIndex);
RemoveFromCluster(popServer);
}
}

Expand Down
8 changes: 4 additions & 4 deletions Raven.Tests.Raft/ClusterLeave.cs
Expand Up @@ -18,8 +18,8 @@ public class ClusterLeave : RaftTestBase
private List<DocumentStore> clusterStores;

[Theory]
[InlineData(2)]
[InlineData(3)]
[InlineData(5)]
public void CanLeaveLeaderFromClusterFromLeader(int nodesCount)
{
clusterStores = CreateRaftCluster(nodesCount);
Expand All @@ -28,8 +28,8 @@ public void CanLeaveLeaderFromClusterFromLeader(int nodesCount)
}

[Theory]
[InlineData(2)]
[InlineData(3)]
[InlineData(5)]
public void CanLeaveLeaderFromClusterFromNonLeader(int nodesCount)
{
clusterStores = CreateRaftCluster(nodesCount);
Expand All @@ -39,8 +39,8 @@ public void CanLeaveLeaderFromClusterFromNonLeader(int nodesCount)


[Theory]
[InlineData(2)]
[InlineData(3)]
[InlineData(5)]
public void CanLeaveNonLeaderFromClusterFromLeader(int nodesCount)
{
clusterStores = CreateRaftCluster(nodesCount);
Expand All @@ -49,8 +49,8 @@ public void CanLeaveNonLeaderFromClusterFromLeader(int nodesCount)
}

[Theory]
[InlineData(2)]
[InlineData(3)]
[InlineData(5)]
public void CanLeaveNonLeaderFromClusterFromNonLeader(int nodesCount)
{
clusterStores = CreateRaftCluster(nodesCount);
Expand Down
27 changes: 20 additions & 7 deletions Raven.Tests.Raft/RaftTestBase.cs
Expand Up @@ -198,17 +198,30 @@ public List<DocumentStore> ExtendRaftCluster(int numberOfExtraNodes, string acti

public void RemoveFromCluster(RavenDbServer serverToRemove)
{
var leader = servers.FirstOrDefault(server => server.Options.ClusterManager.Value.IsLeader());
//any test that fails because of this is invalid.
if (servers.Count <= 2)
throw new InvalidOperationException("Can't remove node from cluster when there are two nodes in the cluster, you need to brutly remove the node.");
var leader = ChooseTheRealLeader();
if (leader == null)
throw new InvalidOperationException("Leader is currently not present, thus can't remove node from cluster");
if (leader == serverToRemove)
{
{
leader.Options.ClusterManager.Value.Engine.StepDownAsync().Wait();
}
else
{
leader.Options.ClusterManager.Value.Engine.RemoveFromClusterAsync(serverToRemove.Options.ClusterManager.Value.Engine.Options.SelfConnection).Wait(10000);
}
leader.Server.Options.ClusterManager.Value.Engine.WaitForLeader();
leader = ChooseTheRealLeader();
leader.Server.Options.ClusterManager.Value.Engine.WaitForLeaderConfirmed();
//this is because a leader chosen event is placed wrongly
//SpinWait.SpinUntil(()=>leader.Options.ClusterManager.Value.Engine.PersistentState.GetLogEntry(leader.Options.ClusterManager.Value.Engine.CommitIndex).Term
// == leader.Options.ClusterManager.Value.Engine.PersistentState.CurrentTerm,TimeSpan.FromSeconds(10));
}
leader.Options.ClusterManager.Value.Engine.RemoveFromClusterAsync(serverToRemove.Options.ClusterManager.Value.Engine.Options.SelfConnection).Wait(10000);
}

/// <summary>
/// Picks, among the servers that currently report themselves as leader, the one
/// whose last log entry has the highest term (ties broken by the highest index).
/// </summary>
/// <returns>
/// The matching server, or <c>null</c> when no server reports itself as leader.
/// </returns>
private RavenDbServer ChooseTheRealLeader()
{
    // Rank every server by how far its log has advanced: newest term first,
    // then highest index within that term.
    var rankedByLog = servers
        .OrderByDescending(candidate => candidate.Options.ClusterManager.Value.Engine.PersistentState.LastLogEntry().Term)
        .ThenByDescending(candidate => candidate.Options.ClusterManager.Value.Engine.PersistentState.LastLogEntry().Index);

    // Walk the ranking and return the first server that claims leadership.
    foreach (var candidate in rankedByLog)
    {
        if (candidate.Options.ClusterManager.Value.IsLeader())
        {
            return candidate;
        }
    }

    return null;
}

private void WaitForClusterToBecomeNonStale(IReadOnlyCollection<RavenDbServer> nodes)
Expand Down
4 changes: 2 additions & 2 deletions Raven.Tests.Raft/Snapshotting.cs
Expand Up @@ -58,8 +58,8 @@ public void CanInstallSnapshot()
{
Name = RaftHelper.GetNodeName(newServer.SystemDatabase.TransactionalStorage.Id),
Uri = RaftHelper.GetNodeUrl(newServer.SystemDatabase.Configuration.ServerUrl)
}).Wait(10000));
Assert.True(allNodesFinishedJoining.Wait(10000));
}).Wait(20000));
Assert.True(allNodesFinishedJoining.Wait(20000));

Assert.True(snapshotInstalledMre.Wait(TimeSpan.FromSeconds(5)));
}
Expand Down

0 comments on commit 5039333

Please sign in to comment.