fix(disable): enforce serverGroup.disabled is true (#3836)
Let's simplify the logic in WaitForClusterDisableTask:
1. first, make sure that the server group's disabled flag is set
2. then delegate to waitForRequiredInstancesDownTask to check the various healths and the subtler points, such as whether interestingHealthProviderNames is set

Motivation: it was possible to exit WaitForClusterDisableTask with a server group that had Down instances but `disabled: false`. Such a server group still counts as active, so it would be filtered out by downstream stages like scaleDownServerGroup.
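
In outline, the new isRunning check reads as follows (a condensed sketch of the Groovy diff below, using the new RUNNING/COMPLETED boolean aliases; the serverGroup-missing early exit is elided):

  // nothing to wait for if the caller explicitly asked for no healths
  if (interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()) {
    return COMPLETED
  }
  // hold the stage until the disabled flag is actually set
  if (!targetServerGroup.isDisabled()) {
    return RUNNING
  }
  // only then delegate the per-instance health checks
  return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map,
      targetServerGroup.getInstances(), interestingHealthProviderNames)
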
dreynaud committed Jul 27, 2020
1 parent c47e530 commit f89aa30
Showing 2 changed files with 43 additions and 51 deletions.
WaitForClusterDisableTask.groovy
@@ -23,13 +23,15 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
 import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.ServerGroupCreator
 import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.WaitForRequiredInstancesDownTask
 import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware
-import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper
 import org.springframework.beans.factory.annotation.Autowired
 import org.springframework.stereotype.Component
 import org.springframework.beans.factory.annotation.Value
 
 @Component
 class WaitForClusterDisableTask extends AbstractWaitForClusterWideClouddriverTask implements CloudProviderAware {
+  // to make the logic more readable, we map true and false to words
+  private static final boolean RUNNING = true
+  private static final boolean COMPLETED = false
 
   @Value('${tasks.disable-cluster-min-time-millis:90000}')
   private int MINIMUM_WAIT_TIME_MS
@@ -68,33 +70,22 @@ class WaitForClusterDisableTask extends AbstractWaitForClusterWideClouddriverTask implements CloudProviderAware {
     // null vs empty interestingHealthProviderNames do mean very different things to Spinnaker
     // a null value will result in Spinnaker waiting for discovery + platform, etc. whereas an empty will not wait for anything.
     if (interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()) {
-      return false
+      return COMPLETED
     }
 
     if (!serverGroup.isPresent()) {
-      return false
+      return COMPLETED
     }
 
-    // Even if the server group is disabled, we want to make sure instances are down
-    // to prevent downstream stages (e.g. scaleDownCluster) from having to deal with disabled-but-instances-up server groups
+    // make sure that we wait for the disabled flag to be set
     def targetServerGroup = serverGroup.get()
-    if (targetServerGroup.isDisabled() || stage.context.desiredPercentage) {
-      return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
+    if (!targetServerGroup.isDisabled()) {
+      return RUNNING
     }
 
-    // TODO(lwander) investigate if the non-desiredPercentage/only-platform-health case code can be dropped in favor of waitForRequiredInstancesDownTask
-    // The operation can be considered complete if it was requested to only consider the platform health.
-    def platformHealthType = getPlatformHealthType(stage, targetServerGroup)
-    return !(platformHealthType && interestingHealthProviderNames == [platformHealthType])
-  }
-
-  private String getPlatformHealthType(StageExecution stage, TargetServerGroup targetServerGroup) {
-    def platformHealthType = targetServerGroup.instances.collect { instance ->
-      HealthHelper.findPlatformHealth(instance.health)
-    }?.find {
-      it.type
-    }?.type
-
-    return platformHealthType ? platformHealthType : healthProviderNamesByPlatform[getCloudProvider(stage)]
+    // we want to make sure instances are down
+    // to prevent downstream stages (e.g. scaleDownCluster) from having to deal with disabled-but-instances-up server groups
+    // note that waitForRequiredInstancesDownTask knows how to deal with desiredPercentages, interestingHealthProviderNames, etc.
+    return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
   }
 }
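
As an aside, the null-vs-empty distinction called out in the comments above can be shown with a small self-contained Groovy sketch (a hypothetical helper for illustration, not Orca code):

  // mirrors the early-exit condition in isRunning above
  boolean completesImmediately(List<String> interestingHealthProviderNames) {
    return interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()
  }

  assert !completesImmediately(null)                    // null: wait for discovery + platform healths
  assert completesImmediately([])                       // empty: wait for nothing, done right away
  assert !completesImmediately(['platformHealthType'])  // named providers: keep checking them
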
WaitForClusterDisableTaskSpec.groovy
@@ -31,7 +31,7 @@ class WaitForClusterDisableTaskSpec extends Specification {
   @Subject def task = new WaitForClusterDisableTask([serverGroupCreator])
 
   @Unroll
-  def "status=#status when oldSGDisabled=#oldSGDisabled, desiredPercentage=#desiredPct, interestingHealthProviderNames=#interestingHealthProviderNames"() {
+  def "status=#status when desiredPercentage=#desiredPct, interestingHealthProviderNames=#interestingHealthProviderNames"() {
     given:
     def stage = stage {
       context = [
@@ -53,7 +53,7 @@ class WaitForClusterDisableTaskSpec extends Specification {
         serverGroup("$clusterName-v051".toString(), "us-west-1", [:]),
         serverGroup("$clusterName-$newServerGroup".toString(), region, [:]),
         serverGroup("$clusterName-$oldServerGroup".toString(), region, [
-          disabled: oldSGDisabled,
+          disabled: disabled,
           capacity: [desired: desired],
           instances: [
             instance('i-1', platformHealthState, extraHealths),
@@ -75,36 +75,37 @@ class WaitForClusterDisableTaskSpec extends Specification {
     result.getStatus() == status
 
     where:
-    dsgregion | minWaitTime | oldSGDisabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status
-    "other"   | 0           | false         | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || SUCCEEDED // exercises if (!remainingDeployServerGroups)...
-    "other"   | 90          | false         | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || RUNNING // keeps running if duration < minWaitTime
+    dsgregion | minWaitTime | disabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status
+    "other"   | 0           | false    | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || SUCCEEDED // exercises if (!remainingDeployServerGroups)...
+    "other"   | 90          | false    | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || RUNNING // keeps running if duration < minWaitTime
+    region    | 0           | false    | 3       | null       | null                           | []           | 'Unknown'           || RUNNING // keeps running if disabled is false
 
-    // tests for isDisabled==true
-    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
-    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled
-    region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct
-    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
-    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
-    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down...
-    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up
-
-    // tests for isDisabled==false, no desiredPct
-    region | 0 | false | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early
-    region | 0 | false | 3 | null | null | [] | 'Unknown' || RUNNING // exercises null vs empty behavior of interestingHealthProviderNames
-    region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health
-    region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Up' || SUCCEEDED // considered complete because only considers the platform health, despite platform health being Up
-    region | 0 | false | 3 | null | ['strangeType'] | [] | 'Unknown' || RUNNING // can't complete if we need to monitor an unknown health provider...
-    region | 0 | false | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || RUNNING // ...regardless of down status
-
-    // tests for waitForRequiredInstancesDownTask.hasSucceeded
-    region | 0 | false | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform, and it looks down
-    region | 0 | false | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform, and it looks NOT down
-    region | 0 | false | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize
-    region | 0 | false | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done
-    region | 0 | false | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // ...unless we have data for that health provider
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...unless we have data for that health provider
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Starting') | 'Unknown' || SUCCEEDED // Starting is considered down
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled
+    region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct
+    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
+    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down...
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up
+
+    // tests for no desiredPct
+    region | 0 | true | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early
+    region | 0 | true | 3 | null | null | [] | 'Unknown' || SUCCEEDED // exercises null vs empty behavior of interestingHealthProviderNames
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health
+    region | 0 | true | 3 | null | ['platformHealthType'] | health('eureka', 'Up') | 'Unknown' || SUCCEEDED // up health is filtered out so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // missing health is filtered out so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // extra health is down so we're done
+
+    // tests with desiredPct
+    region | 0 | true | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform
+    region | 0 | true | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform
+    region | 0 | true | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize
+    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // extra health is down so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // extra health is up so we're not done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Starting') | 'Unknown' || SUCCEEDED // Starting is considered down
 
     oldServerGroup = "v167"
     newServerGroup = "v168"
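
Because the feature method is annotated with @Unroll, each where: row above runs as its own test, with the #status, #desiredPct and #interestingHealthProviderNames placeholders in the method name substituted per row. For example, the new third data row unrolls to:

  status=RUNNING when desiredPercentage=null, interestingHealthProviderNames=null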
