fix(monitored deploy): Give more leniency to scale down operations (#3329)

During a rolling deploy, if `scaleDown`, `notifyDeployCompleted`, or `shrinkCluster` fails and auto rollback was requested, a rollback will be performed.
This is suboptimal because these operations are non-critical: at that point the deploy has already completed and the old ASG is disabled.
They are purely cleanup actions, and failing and rolling back an otherwise successful deploy because a cleanup step failed is unnecessary.

This change makes the three stages return `FAILED_CONTINUE` instead of `TERMINAL` on failure, so a failure in them no longer triggers a rollback.
The pipeline will still be marked as `FAILED_CONTINUE`.
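
In broad strokes (a minimal sketch, not code from this commit): each cleanup stage gets `continuePipeline: true` in its context, and failure-status resolution then maps a failure of that stage to `FAILED_CONTINUE` rather than `TERMINAL`. The `StageSketch` class below is a hypothetical stand-in for the real `Stage` and the Kotlin `failureStatus()` extension shown in the diff; only the flag handling mirrors the commit.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only: a stand-in for the real Stage / failureStatus() logic.
class StageSketch {
  final Map<String, Object> context = new HashMap<>();

  // Mirrors the setter added to Stage in this commit.
  void setContinuePipelineOnFailure(boolean continuePipeline) {
    context.put("continuePipeline", continuePipeline);
  }

  boolean getContinuePipelineOnFailure() {
    return Boolean.TRUE.equals(context.get("continuePipeline"));
  }

  // Hypothetical, simplified equivalent of the Kotlin failureStatus() extension:
  // a failing cleanup stage with the flag set resolves to FAILED_CONTINUE,
  // so the deploy is not rolled back.
  String failureStatus() {
    return getContinuePipelineOnFailure() ? "FAILED_CONTINUE" : "TERMINAL";
  }
}
```

This is what the strategy change below does for the scaleDown, shrinkCluster, and notifyDeployCompleted synthetic stages via `setContinuePipelineOnFailure(true)`.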
marchello2000 authored and mergify[bot] committed Dec 9, 2019
1 parent f8d95ff commit e12a4a2
Showing 6 changed files with 74 additions and 16 deletions.
@@ -278,16 +278,21 @@ class MonitoredDeployStrategy implements Strategy {
def scaleDown = baseContext + [
allowScaleDownActive : false,
remainingFullSizeServerGroups: 1,
preferLargerOverNewer : false
preferLargerOverNewer : false,

]
stages << newStage(
Stage scaleDownStage = newStage(
stage.execution,
ScaleDownClusterStage.PIPELINE_CONFIG_TYPE,
"scaleDown",
scaleDown,
stage,
SyntheticStageOwner.STAGE_AFTER
)

scaleDownStage.setAllowSiblingStagesToContinueOnFailure(true)
scaleDownStage.setContinuePipelineOnFailure(true)
stages << scaleDownStage
}

// Only unpin if we have a source ASG and we didn't scale it down
@@ -317,25 +322,33 @@ class MonitoredDeployStrategy implements Strategy {
allowDeleteActive : false,
retainLargerOverNewer: false
]
stages << newStage(
Stage shrinkClusterStage = newStage(
stage.execution,
ShrinkClusterStage.STAGE_TYPE,
"shrinkCluster",
shrinkContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)

shrinkClusterStage.setAllowSiblingStagesToContinueOnFailure(true)
shrinkClusterStage.setContinuePipelineOnFailure(true)
stages << shrinkClusterStage
}

if (stageData.deploymentMonitor?.id) {
stages << newStage(
Stage notifyDeployCompletedStage = newStage(
stage.execution,
NotifyDeployCompletedStage.PIPELINE_CONFIG_TYPE,
"Notify monitored deploy complete",
evalContext + [hasDeploymentFailed: false],
stage,
SyntheticStageOwner.STAGE_AFTER
)

notifyDeployCompletedStage.setAllowSiblingStagesToContinueOnFailure(true)
notifyDeployCompletedStage.setContinuePipelineOnFailure(true)
stages << notifyDeployCompletedStage
}

return stages
@@ -22,6 +22,7 @@
import com.netflix.spinnaker.orca.ExecutionStatus;
import com.netflix.spinnaker.orca.RetryableTask;
import com.netflix.spinnaker.orca.TaskResult;
import com.netflix.spinnaker.orca.clouddriver.pipeline.monitoreddeploy.NotifyDeployCompletedStage;
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.MonitoredDeployStageData;
import com.netflix.spinnaker.orca.deploymentmonitor.DeploymentMonitorServiceProvider;
import com.netflix.spinnaker.orca.deploymentmonitor.models.DeploymentStep;
@@ -188,6 +189,22 @@ private TaskResult handleError(
}
}

// Don't let failures in NotifyDeployComplete cause rollback
if (stage.getType().equalsIgnoreCase(NotifyDeployCompletedStage.PIPELINE_CONFIG_TYPE)) {
log.warn(
"Failed to get valid response for {} from deployment monitor {}, ignoring failure",
getClass().getSimpleName(),
monitorDefinition,
e);

String userMessage =
String.format(
"Failed to get a valid response from deployment monitor %s, proceeding anyway to avoid unnecessary rollback",
monitorDefinition.getName());

return buildTaskResult(TaskResult.builder(ExecutionStatus.FAILED_CONTINUE), userMessage);
}

if (shouldFailOnError(stage, monitorDefinition)) {
registry
.counter("deploymentMonitor.fatalErrors", "monitorId", monitorDefinition.getId())
@@ -17,6 +17,8 @@
package com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies

import com.netflix.spinnaker.config.DeploymentMonitorDefinition
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ScaleDownClusterStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ShrinkClusterStage
import com.netflix.spinnaker.orca.deploymentmonitor.DeploymentMonitorServiceProvider
import com.netflix.spinnaker.kork.web.exceptions.NotFoundException
import com.netflix.spinnaker.moniker.Moniker
@@ -108,6 +110,28 @@ class MonitoredDeployStrategySpec extends Specification {
afterStages[7].type == DisableServerGroupStage.PIPELINE_CONFIG_TYPE
afterStages[7].context.desiredPercentage == 100

when: 'scale down is requested'
stage.context.scaleDown = true
afterStages = strategy.composeAfterStages(stage)
stage.context.scaleDown = false

then: 'adds a scale down stage'
afterStages.size() == 11
afterStages[9].type == ScaleDownClusterStage.PIPELINE_CONFIG_TYPE
afterStages[9].allowSiblingStagesToContinueOnFailure == true
afterStages[9].continuePipelineOnFailure == true

when: 'shrink is requested'
stage.context.maxRemainingAsgs = 2
afterStages = strategy.composeAfterStages(stage)
stage.context.remove("maxRemainingAsgs")

then: 'adds a shrink cluster stage'
afterStages.size() == 12
afterStages[10].type == ShrinkClusterStage.STAGE_TYPE
afterStages[10].allowSiblingStagesToContinueOnFailure == true
afterStages[10].continuePipelineOnFailure == true

when: 'no deployment monitor specified'
stage.context.remove("deploymentMonitor")
afterStages = strategy.composeAfterStages(stage)
@@ -803,6 +803,17 @@ public void setAllowSiblingStagesToContinueOnFailure(boolean propagateFailuresTo
context.put("allowSiblingStagesToContinueOnFailure", propagateFailuresToParent);
}

@JsonIgnore
public void setContinuePipelineOnFailure(boolean continuePipeline) {
context.put("continuePipeline", continuePipeline);
}

@JsonIgnore
public boolean getContinuePipelineOnFailure() {
StageContext context = (StageContext) getContext();
return (boolean) context.getCurrentOnly("continuePipeline", false);
}

public static class LastModifiedDetails implements Serializable {
private String user;

@@ -18,7 +18,6 @@ package com.netflix.spinnaker.orca.kayenta.pipeline

import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.DisableClusterStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ShrinkClusterStage
import com.netflix.spinnaker.orca.ext.setContinueOnFailure
import com.netflix.spinnaker.orca.kayenta.model.ServerGroupSpec
import com.netflix.spinnaker.orca.kayenta.model.cluster
import com.netflix.spinnaker.orca.kayenta.model.deployments
@@ -47,15 +46,15 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.control.toContext)
it.context["remainingEnabledServerGroups"] = 0
// Even if disabling fails, move on to shrink below which will have another go at destroying the server group
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
},
graph.add {
it.type = DisableClusterStage.STAGE_TYPE
it.name = "Disable experiment cluster ${pair.experiment.cluster}"
it.context.putAll(pair.experiment.toContext)
it.context["remainingEnabledServerGroups"] = 0
// Even if disabling fails, move on to shrink below which will have another go at destroying the server group
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}
)
}
@@ -82,7 +81,7 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.control.toContext)
it.context["allowDeleteActive"] = true
it.context["shrinkToSize"] = 0
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}

// destroy experiment cluster
@@ -92,7 +91,7 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.experiment.toContext)
it.context["allowDeleteActive"] = true
it.context["shrinkToSize"] = 0
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}
}
}
@@ -128,15 +128,9 @@ inline fun <reified O> Stage.mapTo(): O = mapTo(O::class.java)
fun Stage.shouldFailPipeline(): Boolean =
context["failPipeline"] in listOf(null, true)

fun Stage.shouldContinueOnFailure(): Boolean =
context["continuePipeline"] == true

fun Stage.setContinueOnFailure(continueOnFailure: Boolean) =
context.put("continuePipeline", continueOnFailure)

fun Stage.failureStatus(default: ExecutionStatus = TERMINAL) =
when {
shouldContinueOnFailure() -> FAILED_CONTINUE
continuePipelineOnFailure -> FAILED_CONTINUE
shouldFailPipeline() -> default
else -> STOPPED
}
