Avoid using WindowsFS in ClusterRerouteIT (elastic#52488)

Issue elastic#52000 looks like a case of cluster state updates being slower than expected, but it seems that these slowdowns are relatively rare: most invocations of `testDelayWithALargeAmountOfShards` take well under a minute in CI, but there are occasional failures that take 6+ minutes instead. When it fails like this, cluster state persistence seems generally slow: most are slower than expected, with some small updates even taking over 2 seconds to complete. The failures all have in common that they use `WindowsFS` to emulate Windows' behaviour of refusing to delete files that are still open, by tracking all files (really, inodes) and validating that deleted files are really closed first. There is a suggestion that this is a little slow in the Lucene test framework [1]. To see if we can attribute the slowdown to that common factor, this commit suppresses the use of `WindowsFS` for this test suite. [1] https://github.com/apache/lucene-solr/blob/4a513fa99f638cb65e0cae59bfdf7af410c0327a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleTemporaryFilesCleanup.java#L166
sbourke · Feb 19, 2020 · 6c89d29 · 6c89d29
1 parent 25197e2
commit 6c89d29
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/server/src/test/java/org/elasticsearch/cluster/allocation/ClusterRerouteIT.java b/server/src/test/java/org/elasticsearch/cluster/allocation/ClusterRerouteIT.java
@@ -22,6 +22,7 @@
 import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteResponse;
 import org.elasticsearch.action.admin.cluster.reroute.TransportClusterRerouteAction;
@@ -57,7 +58,6 @@
 import org.elasticsearch.test.ESIntegTestCase.Scope;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.MockLogAppender;
-import org.elasticsearch.test.junit.annotations.TestLogging;
 
 import java.nio.file.Path;
 import java.util.Arrays;
@@ -75,6 +75,8 @@
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 
+// testDelayWithALargeAmountOfShards does a lot of cluster state updates, and WindowsFS slows it down too much (#52000)
+@LuceneTestCase.SuppressFileSystems(value = "WindowsFS")
 @ClusterScope(scope = Scope.TEST, numDataNodes = 0)
 public class ClusterRerouteIT extends ESIntegTestCase {
     private final Logger logger = LogManager.getLogger(ClusterRerouteIT.class);
@@ -188,8 +190,6 @@ public void testRerouteWithAllocateLocalGateway_enableAllocationSettings() throw
         rerouteWithAllocateLocalGateway(commonSettings);
     }
 
-    @TestLogging(reason = "https://github.com/elastic/elasticsearch/issues/52000",
-        value = "org.elasticsearch.gateway.PersistedClusterStateService:DEBUG,org.elasticsearch.cluster.service.MasterService:DEBUG")
     public void testDelayWithALargeAmountOfShards() throws Exception {
         Settings commonSettings = Settings.builder()
                 .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1)