fix(disable): enforce serverGroup.disabled is true (#3836)
Let's simplify the logic in WaitForClusterDisableTask:
1. first, make sure that the server group's disabled flag is set
2. then delegate to waitForRequiredInstancesDownTask to check the various healths and the subtler points, such as whether interestingHealthProviderNames is set

Motivation: it was possible to exit WaitForClusterDisableTask with a server group that had Down instances but `disabled: false`. Such a server group still counts as active, so it would be filtered out by downstream stages like scaleDownServerGroup.
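
In outline, the new isRunning check reads as follows (a condensed sketch of the Groovy diff below, using the new RUNNING/COMPLETED boolean aliases; the serverGroup-missing early exit is elided):

  // nothing to wait for if the caller explicitly asked for no healths
  if (interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()) {
    return COMPLETED
  }
  // hold the stage until the disabled flag is actually set
  if (!targetServerGroup.isDisabled()) {
    return RUNNING
  }
  // only then delegate the per-instance health checks
  return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map,
      targetServerGroup.getInstances(), interestingHealthProviderNames)
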
dreynaud committed Jul 27, 2020
1 parent c47e530 commit f89aa30
Showing 2 changed files with 43 additions and 51 deletions.
WaitForClusterDisableTask.groovy
@@ -23,13 +23,15 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
 import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.ServerGroupCreator
 import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.WaitForRequiredInstancesDownTask
 import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware
-import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper
 import org.springframework.beans.factory.annotation.Autowired
 import org.springframework.stereotype.Component
 import org.springframework.beans.factory.annotation.Value
 
 @Component
 class WaitForClusterDisableTask extends AbstractWaitForClusterWideClouddriverTask implements CloudProviderAware {
+  // to make the logic more readable, we map true and false to words
+  private static final boolean RUNNING = true
+  private static final boolean COMPLETED = false
 
   @Value('${tasks.disable-cluster-min-time-millis:90000}')
   private int MINIMUM_WAIT_TIME_MS
@@ -68,33 +70,22 @@ class WaitForClusterDisableTask extends AbstractWaitForClusterWideClouddriverTask implements CloudProviderAware {
     // null vs empty interestingHealthProviderNames do mean very different things to Spinnaker
     // a null value will result in Spinnaker waiting for discovery + platform, etc. whereas an empty will not wait for anything.
     if (interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()) {
-      return false
+      return COMPLETED
     }
 
     if (!serverGroup.isPresent()) {
-      return false
+      return COMPLETED
     }
 
-    // Even if the server group is disabled, we want to make sure instances are down
-    // to prevent downstream stages (e.g. scaleDownCluster) from having to deal with disabled-but-instances-up server groups
+    // make sure that we wait for the disabled flag to be set
     def targetServerGroup = serverGroup.get()
-    if (targetServerGroup.isDisabled() || stage.context.desiredPercentage) {
-      return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
+    if (!targetServerGroup.isDisabled()) {
+      return RUNNING
     }
 
-    // TODO(lwander) investigate if the non-desiredPercentage/only-platform-health case code can be dropped in favor of waitForRequiredInstancesDownTask
-    // The operation can be considered complete if it was requested to only consider the platform health.
-    def platformHealthType = getPlatformHealthType(stage, targetServerGroup)
-    return !(platformHealthType && interestingHealthProviderNames == [platformHealthType])
-  }
-
-  private String getPlatformHealthType(StageExecution stage, TargetServerGroup targetServerGroup) {
-    def platformHealthType = targetServerGroup.instances.collect { instance ->
-      HealthHelper.findPlatformHealth(instance.health)
-    }?.find {
-      it.type
-    }?.type
-
-    return platformHealthType ? platformHealthType : healthProviderNamesByPlatform[getCloudProvider(stage)]
+    // we want to make sure instances are down
+    // to prevent downstream stages (e.g. scaleDownCluster) from having to deal with disabled-but-instances-up server groups
+    // note that waitForRequiredInstancesDownTask knows how to deal with desiredPercentages, interestingHealthProviderNames, etc.
+    return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
   }
 }
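
As an aside, the null-vs-empty distinction called out in the comments above can be shown with a small self-contained Groovy sketch (a hypothetical helper for illustration, not Orca code):

  // mirrors the early-exit condition in isRunning above
  boolean completesImmediately(List<String> interestingHealthProviderNames) {
    return interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()
  }

  assert !completesImmediately(null)                    // null: wait for discovery + platform healths
  assert completesImmediately([])                       // empty: wait for nothing, done right away
  assert !completesImmediately(['platformHealthType'])  // named providers: keep checking them
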
WaitForClusterDisableTaskSpec.groovy
@@ -31,7 +31,7 @@ class WaitForClusterDisableTaskSpec extends Specification {
   @Subject def task = new WaitForClusterDisableTask([serverGroupCreator])
 
   @Unroll
-  def "status=#status when oldSGDisabled=#oldSGDisabled, desiredPercentage=#desiredPct, interestingHealthProviderNames=#interestingHealthProviderNames"() {
+  def "status=#status when desiredPercentage=#desiredPct, interestingHealthProviderNames=#interestingHealthProviderNames"() {
     given:
     def stage = stage {
       context = [
@@ -53,7 +53,7 @@ class WaitForClusterDisableTaskSpec extends Specification {
         serverGroup("$clusterName-v051".toString(), "us-west-1", [:]),
         serverGroup("$clusterName-$newServerGroup".toString(), region, [:]),
         serverGroup("$clusterName-$oldServerGroup".toString(), region, [
-          disabled: oldSGDisabled,
+          disabled: disabled,
           capacity: [desired: desired],
           instances: [
             instance('i-1', platformHealthState, extraHealths),
@@ -75,36 +75,37 @@ class WaitForClusterDisableTaskSpec extends Specification {
     result.getStatus() == status
 
     where:
-    dsgregion | minWaitTime | oldSGDisabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status
-    "other"   | 0           | false         | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || SUCCEEDED // exercises if (!remainingDeployServerGroups)...
-    "other"   | 90          | false         | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || RUNNING // keeps running if duration < minWaitTime
+    dsgregion | minWaitTime | disabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status
+    "other"   | 0           | false    | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || SUCCEEDED // exercises if (!remainingDeployServerGroups)...
+    "other"   | 90          | false    | 3       | null       | ['platformHealthType']         | []           | 'Unknown'           || RUNNING // keeps running if duration < minWaitTime
+    region    | 0           | false    | 3       | null       | null                           | []           | 'Unknown'           || RUNNING // keeps running if disabled is false
 
-    // tests for isDisabled==true
-    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
-    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled
-    region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct
-    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
-    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
-    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down...
-    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up
-
-    // tests for isDisabled==false, no desiredPct
-    region | 0 | false | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early
-    region | 0 | false | 3 | null | null | [] | 'Unknown' || RUNNING // exercises null vs empty behavior of interestingHealthProviderNames
-    region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health
-    region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Up' || SUCCEEDED // considered complete because only considers the platform health, despite platform health being Up
-    region | 0 | false | 3 | null | ['strangeType'] | [] | 'Unknown' || RUNNING // can't complete if we need to monitor an unknown health provider...
-    region | 0 | false | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || RUNNING // ...regardless of down status
-
-    // tests for waitForRequiredInstancesDownTask.hasSucceeded
-    region | 0 | false | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform, and it looks down
-    region | 0 | false | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform, and it looks NOT down
-    region | 0 | false | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize
-    region | 0 | false | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done
-    region | 0 | false | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // ...unless we have data for that health provider
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...unless we have data for that health provider
-    region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Starting') | 'Unknown' || SUCCEEDED // Starting is considered down
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled
+    region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct
+    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
+    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down...
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up
+
+    // tests for no desiredPct
+    region | 0 | true | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early
+    region | 0 | true | 3 | null | null | [] | 'Unknown' || SUCCEEDED // exercises null vs empty behavior of interestingHealthProviderNames
+    region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health
+    region | 0 | true | 3 | null | ['platformHealthType'] | health('eureka', 'Up') | 'Unknown' || SUCCEEDED // up health is filtered out so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // missing health is filtered out so we're done
+    region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // extra health is down so we're done
+
+    // tests with desiredPct
+    region | 0 | true | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform
+    region | 0 | true | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform
+    region | 0 | true | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize
+    region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // extra health is down so we're done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // extra health is up so we're not done
+    region | 0 | true | 3 | 100 | ['strangeType'] | health('strange', 'Starting') | 'Unknown' || SUCCEEDED // Starting is considered down
 
     oldServerGroup = "v167"
     newServerGroup = "v168"
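
Because the feature method is annotated with @Unroll, each where: row above runs as its own test, with the #status, #desiredPct and #interestingHealthProviderNames placeholders in the method name substituted per row. For example, the new third data row unrolls to:

  status=RUNNING when desiredPercentage=null, interestingHealthProviderNames=null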
