feat(rollback): avoid scaling down during rollback
This should partly address spinnaker/spinnaker#4895 by no longer resizing the restore server group down.
It does not address concurrent-action issues, so race conditions may still leave the server groups in a bad state.

Fundamentally, this change:
* computes the max of the current restore server group size and the rollback server group size
* only generates a resize stage if that computed capacity differs from the restore server group's current capacity (see the sketch below)

This also opens up the possibility of skipping the snapshot/restoreMin tasks when we are not going to resize (since the resize is what pins the min of the restore server group).
dreynaud committed Sep 23, 2019
1 parent 3bc32da commit 1d4f96b
Showing 4 changed files with 189 additions and 73 deletions.
ExplicitRollback.groovy
@@ -22,13 +22,20 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.providers.aws.CaptureSour
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.DisableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.EnableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.ResizeServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
import com.netflix.spinnaker.orca.clouddriver.utils.OortHelper
import com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategy
import com.netflix.spinnaker.orca.pipeline.WaitStage
import com.netflix.spinnaker.orca.pipeline.model.Stage
import com.netflix.spinnaker.orca.pipeline.model.SyntheticStageOwner
import groovy.util.logging.Slf4j
import org.springframework.beans.factory.annotation.Autowired

import javax.annotation.Nullable

import static com.netflix.spinnaker.orca.pipeline.StageDefinitionBuilder.newStage

@Slf4j
class ExplicitRollback implements Rollback {
String rollbackServerGroupName
String restoreServerGroupName
@@ -61,6 +68,10 @@ class ExplicitRollback implements Rollback {
@JsonIgnore
WaitStage waitStage

@Autowired
@JsonIgnore
OortHelper oortHelper

@JsonIgnore
@Override
List<Stage> buildStages(Stage parentStage) {
@@ -92,29 +103,21 @@
]
}

// https://github.com/spinnaker/spinnaker/issues/4895 - capture the source capacity as early as possible
def stages = []
if (!parentStage.getContext().containsKey("sourceServerGroupCapacitySnapshot")) {
// capacity has been previously captured (likely as part of a failed deploy), no need to do again!

def resizeStage = buildResizeStage(parentStage)

// only capture the source capacity (for unpinning) if we are going to resize
// if the capacity has been previously captured (e.g. as part of a failed deploy), no need to do it again
if (resizeStage != null && !parentStage.getContext().containsKey("sourceServerGroupCapacitySnapshot")) {
stages << buildCaptureSourceServerGroupCapacityStage(parentStage, parentStage.mapTo(ResizeStrategy.Source))
}

stages << enableServerGroupStage

Map resizeServerGroupContext = new HashMap(parentStage.context) + [
action : ResizeStrategy.ResizeAction.scale_to_server_group.toString(),
source : {
def source = parentStage.mapTo(ResizeStrategy.Source)
source.serverGroupName = rollbackServerGroupName
return source
}.call(),
asgName : restoreServerGroupName,
pinMinimumCapacity : true,
targetHealthyDeployPercentage: targetHealthyRollbackPercentage
]
stages << newStage(
parentStage.execution, resizeServerGroupStage.type, "resize", resizeServerGroupContext, parentStage, SyntheticStageOwner.STAGE_AFTER
)
if (resizeStage != null) {
stages << resizeStage
}

if (delayBeforeDisableSeconds != null && delayBeforeDisableSeconds > 0) {
def waitStage = newStage(
@@ -124,10 +127,73 @@
}

stages << disableServerGroupStage
stages << buildApplySourceServerGroupCapacityStage(parentStage, parentStage.mapTo(ResizeStrategy.Source))

// only restore the min if it was pinned, i.e. there was a resize
if (resizeStage != null) {
stages << buildApplySourceServerGroupCapacityStage(parentStage, parentStage.mapTo(ResizeStrategy.Source))
}
return stages
}

@Nullable TargetServerGroup lookupServerGroup(Stage parentStage, String serverGroupName) {
def fromContext = parentStage.mapTo(ResizeStrategy.Source)

try {
// TODO: throw some retries in there?
return oortHelper.getTargetServerGroup(
fromContext.credentials,
serverGroupName,
fromContext.location
).orElse(null)
} catch(Exception e) {
log.error('Skipping resize stage because there was an error looking up {}', serverGroupName, e)
return null
}
}

@Nullable Stage buildResizeStage(Stage parentStage) {
TargetServerGroup rollbackServerGroup = lookupServerGroup(parentStage, rollbackServerGroupName)
if (!rollbackServerGroup) {
return null
}

TargetServerGroup restoreServerGroup = lookupServerGroup(parentStage, restoreServerGroupName)
if (!restoreServerGroup) {
return null
}

// we don't want to scale down restoreServerGroupName if rollbackServerGroupName is smaller for some reason
ResizeStrategy.Capacity newRestoreCapacity = [
max: Math.max(rollbackServerGroup.capacity.max, restoreServerGroup.capacity.max),
desired: Math.max(rollbackServerGroup.capacity.desired, restoreServerGroup.capacity.desired)
]

// let's directly produce a capacity with a pinned min instead of relying on the resize stage
newRestoreCapacity.min = newRestoreCapacity.desired

ResizeStrategy.Capacity currentCapacity = restoreServerGroup.capacity
if (currentCapacity == newRestoreCapacity) {
log.info('Skipping resize stage because the current capacity of the restore server group {} would be unchanged ({})',
restoreServerGroupName, newRestoreCapacity)
return null
}

Map resizeServerGroupContext = new HashMap(parentStage.context) + [
action : ResizeStrategy.ResizeAction.scale_exact.toString(),
capacity : newRestoreCapacity.asMap(),
asgName : restoreServerGroupName,
targetLocation : restoreServerGroup.getLocation(),
account : restoreServerGroup.credentials,
cloudProvider : restoreServerGroup.cloudProvider,
pinMinimumCapacity : true,
targetHealthyDeployPercentage: targetHealthyRollbackPercentage
]

return newStage(parentStage.execution, resizeServerGroupStage.type,
"Resize Restore Server Group to (min: ${newRestoreCapacity.min}, max: ${newRestoreCapacity.max}, desired: ${newRestoreCapacity.desired})",
resizeServerGroupContext, parentStage, SyntheticStageOwner.STAGE_AFTER)
}

Stage buildCaptureSourceServerGroupCapacityStage(Stage parentStage,
ResizeStrategy.Source source) {
Map captureSourceServerGroupCapacityContext = [
OortHelper.groovy
@@ -57,7 +57,7 @@ class OortHelper {
Optional<TargetServerGroup> getTargetServerGroup(String account,
String serverGroupName,
String location,
String cloudProvider) {
String cloudProvider = null) {
return convertedResponse(Map) {
oortService.getServerGroup(account, location, serverGroupName)
}.map({ Map serverGroup -> new TargetServerGroup(serverGroup) })
ResizeStrategy.groovy
@@ -56,6 +56,13 @@ interface ResizeStrategy {
Integer max
Integer desired
Integer min

// from https://codereview.stackexchange.com/questions/8801/returning-groovy-class-fields-as-a-map
public Map asMap() {
this.class.declaredFields.findAll { !it.synthetic }.collectEntries {
[ (it.name): this."$it.name" ]
}
}
}

@Canonical
ExplicitRollbackSpec.groovy
@@ -21,34 +21,36 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.providers.aws.CaptureSour
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.DisableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.EnableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.ResizeServerGroupStage
import com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategy
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
import com.netflix.spinnaker.orca.clouddriver.utils.OortHelper
import com.netflix.spinnaker.orca.pipeline.WaitStage
import com.netflix.spinnaker.orca.pipeline.model.SyntheticStageOwner
import spock.lang.Shared
import spock.lang.Specification
import spock.lang.Subject
import spock.lang.Unroll

import static com.netflix.spinnaker.orca.test.model.ExecutionBuilder.stage;

class ExplicitRollbackSpec extends Specification {
@Shared
public static final String rollbackServerGroupName = "servergroup-v002"
public static final String restoreServerGroupName = "servergroup-v001"
def enableServerGroupStage = new EnableServerGroupStage()

@Shared
def disableServerGroupStage = new DisableServerGroupStage()

@Shared
def resizeServerGroupStage = new ResizeServerGroupStage()

@Shared
def captureSourceServerGroupCapacityStage = new CaptureSourceServerGroupCapacityStage()

@Shared
def applySourceServerGroupCapacityStage = new ApplySourceServerGroupCapacityStage()

@Shared
def waitStage = new WaitStage()
def oortHelper = Mock(OortHelper)

def stage = stage {
type = "rollbackServerGroup"
context = [
credentials : "test",
cloudProvider : "aws",
"region" : "us-west-1"
]
}

@Subject
def rollback = new ExplicitRollback(
@@ -57,30 +59,80 @@
resizeServerGroupStage: resizeServerGroupStage,
captureSourceServerGroupCapacityStage: captureSourceServerGroupCapacityStage,
applySourceServerGroupCapacityStage: applySourceServerGroupCapacityStage,
waitStage: waitStage
waitStage: waitStage,
oortHelper: oortHelper
)

def "should inject enable, resize and disable stages corresponding to the server group being restored and rollbacked"() {
given:
rollback.rollbackServerGroupName = "servergroup-v002"
rollback.restoreServerGroupName = "servergroup-v001"
def setup() {
rollback.rollbackServerGroupName = rollbackServerGroupName
rollback.restoreServerGroupName = restoreServerGroupName
rollback.targetHealthyRollbackPercentage = 95
}

def serverGroup(String name, int desired, Integer min = null, Integer max = null) {
return new TargetServerGroup([
serverGroupName: name,
capacity: [
min: min == null ? desired : min,
max: max == null ? desired : max,
desired: desired
],
region: "us-west-1",
credentials: "test",
cloudProvider: "aws",
])
}

@Unroll
def "should not have a resize stage if it would result in restoreServerGroup being scaled down"() {
when:
def allStages = rollback.buildStages(stage)
def afterStages = allStages.findAll { it.syntheticStageOwner == SyntheticStageOwner.STAGE_AFTER }

then:
1 * oortHelper.getTargetServerGroup(_, rollbackServerGroupName, _) >> Optional.of(serverGroup(rollbackServerGroupName, 2))
1 * oortHelper.getTargetServerGroup(_, restoreServerGroupName, _) >> Optional.of(serverGroup(restoreServerGroupName, restoreServerGroupCapacity))

def stage = stage {
type = "rollbackServerGroup"
context = [
credentials : "test",
cloudProvider : "aws",
"region" : "us-west-1"
]
}
afterStages*.type == expectedAfterStageTypes

where:
restoreServerGroupCapacity || expectedAfterStageTypes
1 || ["captureSourceServerGroupCapacity", "enableServerGroup", "resizeServerGroup", "disableServerGroup", "applySourceServerGroupCapacity"]
2 || ["enableServerGroup", "disableServerGroup"]
3 || ["enableServerGroup", "disableServerGroup"]
}

@Unroll
def "generated resize context for restoreServerGroup should #description"() {
when:
def allStages = rollback.buildStages(stage)
def afterStages = allStages.findAll { it.syntheticStageOwner == SyntheticStageOwner.STAGE_AFTER }

then:
1 * oortHelper.getTargetServerGroup(_, rollbackServerGroupName, _) >> Optional.of(serverGroup(rollbackServerGroupName, 2, 0, 4))
1 * oortHelper.getTargetServerGroup(_, restoreServerGroupName, _) >> Optional.of(serverGroup(restoreServerGroupName, restoreDesired, restoreMin, restoreMax))

afterStages[2].context.capacity == expectedCapacity

where:
restoreDesired | restoreMin | restoreMax || expectedCapacity || description
2 | 0 | 4 || [desired: 2, min: 2, max: 4] || 'pin the min'
1 | 0 | 4 || [desired: 2, min: 2, max: 4] || 'scale up to rollbackServerGroup.desired'
3 | 0 | 4 || [desired: 3, min: 3, max: 4] || 'not scale down to 2, just pin the min'
2 | 0 | 3 || [desired: 2, min: 2, max: 4] || 'inherit max from rollbackServerGroup'
2 | 0 | 5 || [desired: 2, min: 2, max: 5] || 'preserve its own max'
}

def "should inject enable, resize and disable stages corresponding to the server group being restored and rollbacked"() {
when:
def allStages = rollback.buildStages(stage)
def beforeStages = allStages.findAll { it.syntheticStageOwner == SyntheticStageOwner.STAGE_BEFORE }
def afterStages = allStages.findAll { it.syntheticStageOwner == SyntheticStageOwner.STAGE_AFTER }

then:
1 * oortHelper.getTargetServerGroup(_, rollbackServerGroupName, _) >> Optional.of(serverGroup(rollbackServerGroupName, 2))
1 * oortHelper.getTargetServerGroup(_, restoreServerGroupName, _) >> Optional.of(serverGroup(restoreServerGroupName, 1))

beforeStages.isEmpty()
afterStages*.type == [
"captureSourceServerGroupCapacity", // spinnaker/issues/4895, capture capacity snapshot first
@@ -91,33 +143,35 @@
]
afterStages[0].context == [
source : [
asgName : "servergroup-v002",
serverGroupName: "servergroup-v002",
asgName : rollbackServerGroupName,
serverGroupName: rollbackServerGroupName,
region : "us-west-1",
account : "test",
cloudProvider : "aws"
],
useSourceCapacity: true
]
afterStages[1].context == stage.context + [
serverGroupName: "servergroup-v001",
serverGroupName: restoreServerGroupName,
targetHealthyDeployPercentage: 95
]
afterStages[2].context == stage.context + [
action : "scale_to_server_group",
source : new ResizeStrategy.Source(null, null, "us-west-1", null, "servergroup-v002", "test", "aws"),
asgName : "servergroup-v001",
action : "scale_exact",
asgName : restoreServerGroupName,
pinMinimumCapacity: true,
targetHealthyDeployPercentage: 95
targetHealthyDeployPercentage: 95,
targetLocation: Location.region("us-west-1"),
capacity: [max:2, desired:2, min:2],
account: "test"
]
afterStages[3].context == stage.context + [
serverGroupName: "servergroup-v002"
serverGroupName: rollbackServerGroupName
]
afterStages[4].context == [
cloudProvider: "aws",
target : [
asgName : "servergroup-v001",
serverGroupName: "servergroup-v001",
asgName : restoreServerGroupName,
serverGroupName: restoreServerGroupName,
region : "us-west-1",
account : "test",
cloudProvider : "aws"
Expand All @@ -127,33 +181,22 @@ class ExplicitRollbackSpec extends Specification {
}

@Unroll
def "should inject wait stage before disable stage when 'delayBeforeDisableSeconds' > 0"() {
def "waitStageExpected=#waitStageExpected before disable stage when delayBeforeDisableSeconds=#delayBeforeDisableSeconds"() {
given:
rollback.rollbackServerGroupName = "servergroup-v002"
rollback.restoreServerGroupName = "servergroup-v001"
rollback.targetHealthyRollbackPercentage = 95
rollback.delayBeforeDisableSeconds = delayBeforeDisableSeconds

def stage = stage {
type = "rollbackServerGroup"
context = [
credentials : "test",
cloudProvider : "aws",
"region" : "us-west-1"
]
}

when:
def allStages = rollback.buildStages(stage)
def afterStages = allStages.findAll { it.syntheticStageOwner == SyntheticStageOwner.STAGE_AFTER }

then:
afterStages*.type == expectedAfterStageTypes
afterStages*.type.contains("wait") == waitStageExpected
afterStages*.type.indexOf("wait") < afterStages*.type.indexOf("disableServerGroup")

where:
delayBeforeDisableSeconds || expectedAfterStageTypes
null || ["captureSourceServerGroupCapacity", "enableServerGroup", "resizeServerGroup", "disableServerGroup", "applySourceServerGroupCapacity"]
0 || ["captureSourceServerGroupCapacity", "enableServerGroup", "resizeServerGroup", "disableServerGroup", "applySourceServerGroupCapacity"]
1 || ["captureSourceServerGroupCapacity", "enableServerGroup", "resizeServerGroup", "wait", "disableServerGroup", "applySourceServerGroupCapacity"]
delayBeforeDisableSeconds || waitStageExpected
null || false
0 || false
1 || true
}
}
