fix(RRB & MD): pin and unpin the correct server group (#3197)
During a rolling red/black (or monitored) deploy, use the determined source server group to pin/unpin.
This logic used to live in the deploy preprocessor, but the preprocessor runs too early to be useful (it runs before the determineSourceServerGroup task). That means it can pick the wrong ASG to pin in the following scenario:

```
ASG v2 - being deployed
ASG v1 - Disabled
ASG v0 - Enabled
```

Before this change, we would pin/unpin `v1`, which is clearly the wrong ASG to pin.

NOTE: the code is currently duplicated between the RRB and MD strategies. I will follow up with another PR to refactor them to share code (it's proving harder than anticipated to make that refactor actually nice).
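For context, "pinning" here is just a resize of the source server group with `pinMinimumCapacity: true` (unpinning sets `unpinMinimumCapacity: true` instead). A minimal sketch of the context both strategies now build, using the names from the diff below (`source` is assumed to be already resolved):

```
// Sketch only: mirrors the resize context the RRB/MD strategies build below.
// Assumes `source` is the ResizeStrategy.Source resolved *after* the
// determineSourceServerGroup task has run, i.e. the enabled ASG (v0 above),
// not a disabled one (v1).
def resizeContext = [
  serverGroupName   : source.serverGroupName,
  action            : ResizeStrategy.ResizeAction.scale_to_server_group,
  source            : source,
  useNameAsLabel    : true, // hint to deck that it should _not_ override the name
  pinMinimumCapacity: true  // set unpinMinimumCapacity: true to unpin instead
]
```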
marchello2000 authored Sep 27, 2019
1 parent 4819162 commit 717792b
Showing 5 changed files with 372 additions and 157 deletions.
AwsDeployStagePreProcessor.groovy
@@ -17,36 +17,21 @@

package com.netflix.spinnaker.orca.clouddriver.pipeline.providers.aws

import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.PinServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.AbstractDeployStrategyStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.DeployStagePreProcessor
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroupResolver
import com.netflix.spinnaker.orca.kato.pipeline.strategy.Strategy
import com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategy
import com.netflix.spinnaker.orca.kato.pipeline.support.StageData
import com.netflix.spinnaker.orca.pipeline.CheckPreconditionsStage
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component

import java.util.concurrent.TimeUnit

import static com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategySupport.getSource

@Component
class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
private static final List<Strategy> rollingStrategies = [Strategy.ROLLING_RED_BLACK, Strategy.MONITORED]

@Autowired
ApplySourceServerGroupCapacityStage applySourceServerGroupSnapshotStage

@Autowired
PinServerGroupStage pinServerGroupStage

@Autowired
TargetServerGroupResolver targetServerGroupResolver

@Autowired
CheckPreconditionsStage checkPreconditionsStage

@@ -93,24 +78,6 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
)
}

if (shouldPinSourceServerGroup(stageData.strategy)) {
def optionalResizeContext = getResizeContext(stageData)
if (!optionalResizeContext.isPresent()) {
// this means we don't need to resize anything
// happens in particular when there is no pre-existing source server group
return stageDefinitions
}

def resizeContext = optionalResizeContext.get()
resizeContext.pinMinimumCapacity = true

stageDefinitions << new StageDefinition(
name: "Pin ${resizeContext.serverGroupName}",
stageDefinitionBuilder: pinServerGroupStage,
context: resizeContext
)
}

return stageDefinitions
}

@@ -129,24 +96,6 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
)
}

def unpinServerGroupStage = buildUnpinServerGroupStage(stageData, false)
if (unpinServerGroupStage) {
stageDefinitions << unpinServerGroupStage
}

return stageDefinitions
}

@Override
List<StageDefinition> onFailureStageDefinitions(Stage stage) {
def stageData = stage.mapTo(StageData)
def stageDefinitions = []

def unpinServerGroupStage = buildUnpinServerGroupStage(stageData, true)
if (unpinServerGroupStage) {
stageDefinitions << unpinServerGroupStage
}

return stageDefinitions
}

@@ -156,74 +105,10 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
return stageData.cloudProvider == "aws" // && stageData.useSourceCapacity
}

private static boolean shouldPinSourceServerGroup(String strategy) {
// TODO-AJ consciously only enabling for rolling red/black -- will add support for other strategies after it's working
return (rollingStrategies.contains(Strategy.fromStrategyKey(strategy)))
}

private static boolean shouldCheckServerGroupsPreconditions(StageData stageData) {
// TODO(dreynaud): enabling cautiously for RRB/MD only for testing, but we would ideally roll this out to other strategies
Strategy strategy = Strategy.fromStrategyKey(stageData.strategy)

return (rollingStrategies.contains(strategy) && (stageData.maxInitialAsgs != -1))
}

private Optional<Map<String, Object>> getResizeContext(StageData stageData) {
def cleanupConfig = AbstractDeployStrategyStage.CleanupConfig.fromStage(stageData)
def baseContext = [
(cleanupConfig.location.singularType()): cleanupConfig.location.value,
cluster : cleanupConfig.cluster,
moniker : cleanupConfig.moniker,
credentials : cleanupConfig.account,
cloudProvider : cleanupConfig.cloudProvider,
]

try {
def source = getSource(targetServerGroupResolver, stageData, baseContext)
if (!source) {
return Optional.empty()
}

baseContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true // hint to deck that it should _not_ override the name
])
return Optional.of(baseContext)
} catch(TargetServerGroup.NotFoundException e) {
return Optional.empty()
}
}

private StageDefinition buildUnpinServerGroupStage(StageData stageData, boolean deployFailed) {
if (!shouldPinSourceServerGroup(stageData.strategy)) {
return null
}

if (stageData.scaleDown && !deployFailed) {
// source server group has been scaled down, no need to unpin if deploy was successful
return null
}

def optionalResizeContext = getResizeContext(stageData)
if (!optionalResizeContext.isPresent()) {
// no source server group, no need to unpin anything
return null
}

def resizeContext = optionalResizeContext.get()
resizeContext.unpinMinimumCapacity = true

if (deployFailed) {
// we want to specify a new timeout explicitly here, in case the deploy itself failed because of a timeout
resizeContext.stageTimeoutMs = TimeUnit.MINUTES.toMillis(20)
}

return new StageDefinition(
name: "Unpin ${resizeContext.serverGroupName} (deployFailed=${deployFailed})".toString(),
stageDefinitionBuilder: pinServerGroupStage,
context: resizeContext
)
}
}
AbstractDeployStrategyStage.groovy
@@ -165,19 +165,24 @@ abstract class AbstractDeployStrategyStage extends AbstractCloudProviderAwareStage

@Override
void onFailureStages(Stage stage, StageGraphBuilder graph) {
// TODO(mvulfson): Strategy on failure
Strategy strategy = getStrategy(stage)
// Strategy should never be null during regular execution, but it can be in unit tests.
// Either way, be defensive.
if (strategy != null) {
strategy.composeOnFailureStages(stage).forEach({ graph.append(it) })
}

deployStagePreProcessors
.findAll { it.supports(stage) }
.collect { it.onFailureStageDefinitions(stage) }
.flatten()
.forEach { StageDefinition stageDefinition ->
graph.add {
it.type = stageDefinition.stageDefinitionBuilder.type
it.name = stageDefinition.name
it.context = stageDefinition.context
}
}
}

/**
MonitoredDeployStrategy.groovy
@@ -25,6 +25,7 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.monitoreddeploy.EvaluateD
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.CloneServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.CreateServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.DisableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.PinServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.ResizeServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.DetermineTargetServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
@@ -38,6 +39,9 @@ import groovy.util.logging.Slf4j
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty
import org.springframework.stereotype.Component

import java.util.concurrent.TimeUnit

import static com.netflix.spinnaker.orca.pipeline.StageDefinitionBuilder.newStage
import static com.netflix.spinnaker.orca.kato.pipeline.strategy.Strategy.MONITORED

@@ -129,22 +133,7 @@ class MonitoredDeployStrategy implements Strategy {
def source = null

try {
StageData.Source sourceServerGroup

Stage parentCreateServerGroupStage = stage.directAncestors()
.find() { it.type == CreateServerGroupStage.PIPELINE_CONFIG_TYPE || it.type == CloneServerGroupStage.PIPELINE_CONFIG_TYPE }

StageData parentStageData = parentCreateServerGroupStage.mapTo(StageData)
sourceServerGroup = parentStageData.source

if (sourceServerGroup != null && (sourceServerGroup.serverGroupName != null || sourceServerGroup.asgName != null)) {
source = new ResizeStrategy.Source(
region: sourceServerGroup.region,
serverGroupName: sourceServerGroup.serverGroupName ?: sourceServerGroup.asgName,
credentials: stageData.credentials ?: stageData.account,
cloudProvider: stageData.cloudProvider
)
}
source = lookupSourceServerGroup(stage)
} catch (Exception e) {
// This probably means there was no parent CreateServerGroup stage - which should never happen
throw new IllegalStateException("Failed to determine source server group from parent stage while planning Monitored Deploy flow", e)
@@ -180,7 +169,23 @@ class MonitoredDeployStrategy implements Strategy {
if (source == null) {
log.info("no source server group -- will perform Monitored Deploy to exact fallback capacity $savedCapacity with no disableCluster or scaleDownCluster stages")
} else {
// TODO(mvulfson): Pin Source
def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
pinMinimumCapacity: true
])

stages << newStage(
stage.execution,
PinServerGroupStage.TYPE,
"Pin ${resizeContext.serverGroupName}",
resizeContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)
}

if (mdsd.deploymentMonitor.id) {
@@ -296,9 +301,25 @@ class MonitoredDeployStrategy implements Strategy {
)
}

// TODO(mvulfson): Unpin
if (source) {
// Only unpin if we have a source ASG and we didn't scale it down
if (source && !stageData.scaleDown) {
def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
unpinMinimumCapacity: true
])

stages << newStage(
stage.execution,
PinServerGroupStage.TYPE,
"Unpin ${resizeContext.serverGroupName}",
resizeContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)
}

// TODO(mvulfson): Shrink cluster
@@ -317,6 +338,78 @@

return stages
}

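/**
* On deploy failure, unpins the source server group (if any) so it is not left pinned
* at its minimum capacity. Uses an explicit timeout in case the deploy itself failed
* because of a timeout.
*/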
@Override
List<Stage> composeOnFailureStages(Stage parent) {
def source = null
def stages = []

try {
source = lookupSourceServerGroup(parent)
} catch (Exception e) {
log.warn("Failed to lookup source server group during composeOnFailureStages", e)
}

// No source, nothing to unpin
if (source == null) {
return stages
}

def cleanupConfig = AbstractDeployStrategyStage.CleanupConfig.fromStage(parent)

Map baseContext = [
(cleanupConfig.location.singularType()): cleanupConfig.location.value,
cluster : cleanupConfig.cluster,
moniker : cleanupConfig.moniker,
credentials : cleanupConfig.account,
cloudProvider : cleanupConfig.cloudProvider,
]

def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
unpinMinimumCapacity: true,
// we want to specify a new timeout explicitly here, in case the deploy itself failed because of a timeout
stageTimeoutMs : TimeUnit.MINUTES.toMillis(20)
])

stages << newStage(
parent.execution,
PinServerGroupStage.TYPE,
"Unpin ${resizeContext.serverGroupName}",
resizeContext,
parent,
SyntheticStageOwner.STAGE_AFTER
)

return stages
}

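/**
* Resolves the source server group recorded on the parent CreateServerGroup/CloneServerGroup
* stage (the result of the determineSourceServerGroup task), or null when there is no
* pre-existing source server group.
*/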
ResizeStrategy.Source lookupSourceServerGroup(Stage stage) {
ResizeStrategy.Source source = null
StageData.Source sourceServerGroup

Stage parentCreateServerGroupStage = stage.directAncestors()
.find() { it.type == CreateServerGroupStage.PIPELINE_CONFIG_TYPE || it.type == CloneServerGroupStage.PIPELINE_CONFIG_TYPE }

StageData parentStageData = parentCreateServerGroupStage.mapTo(StageData)
sourceServerGroup = parentStageData.source
MonitoredDeployStageData stageData = stage.mapTo(MonitoredDeployStageData)

if (sourceServerGroup != null && (sourceServerGroup.serverGroupName != null || sourceServerGroup.asgName != null)) {
source = new ResizeStrategy.Source(
region: sourceServerGroup.region,
serverGroupName: sourceServerGroup.serverGroupName ?: sourceServerGroup.asgName,
credentials: stageData.credentials ?: stageData.account,
cloudProvider: stageData.cloudProvider
)
}

return source
}
}

// TODO(mvulfson): Move this someplace nice
(2 more changed files not shown)
