fix(monitored deploy): Give more leniency to scale down operations (#3329)

During a rolling deploy, if `scaleDown`, `notifyDeployCompleted`, or `shrinkCluster` fails and auto rollback was requested, a rollback will be performed.
This is suboptimal because these operations are non-critical: at that point the deploy has already completed and the old ASG is disabled.
They are purely cleanup actions, and failing and rolling back an otherwise successful deploy because a cleanup step failed is unnecessary.

This change makes the three stages return `FAILED_CONTINUE` instead of `TERMINAL` on failure, so a failure in them no longer triggers a rollback.
The pipeline will still be marked as `FAILED_CONTINUE`.
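
In broad strokes (a minimal sketch, not code from this commit): each cleanup stage gets `continuePipeline: true` in its context, and failure-status resolution then maps a failure of that stage to `FAILED_CONTINUE` rather than `TERMINAL`. The `StageSketch` class below is a hypothetical stand-in for the real `Stage` and the Kotlin `failureStatus()` extension shown in the diff; only the flag handling mirrors the commit.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only: a stand-in for the real Stage / failureStatus() logic.
class StageSketch {
  final Map<String, Object> context = new HashMap<>();

  // Mirrors the setter added to Stage in this commit.
  void setContinuePipelineOnFailure(boolean continuePipeline) {
    context.put("continuePipeline", continuePipeline);
  }

  boolean getContinuePipelineOnFailure() {
    return Boolean.TRUE.equals(context.get("continuePipeline"));
  }

  // Hypothetical, simplified equivalent of the Kotlin failureStatus() extension:
  // a failing cleanup stage with the flag set resolves to FAILED_CONTINUE,
  // so the deploy is not rolled back.
  String failureStatus() {
    return getContinuePipelineOnFailure() ? "FAILED_CONTINUE" : "TERMINAL";
  }
}
```

This is what the strategy change below does for the scaleDown, shrinkCluster, and notifyDeployCompleted synthetic stages via `setContinuePipelineOnFailure(true)`.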
marchello2000 authored and mergify[bot] committed Dec 9, 2019
1 parent f8d95ff commit e12a4a2
Showing 6 changed files with 74 additions and 16 deletions.
@@ -278,16 +278,21 @@ class MonitoredDeployStrategy implements Strategy {
def scaleDown = baseContext + [
allowScaleDownActive : false,
remainingFullSizeServerGroups: 1,
preferLargerOverNewer : false
preferLargerOverNewer : false,

]
stages << newStage(
Stage scaleDownStage = newStage(
stage.execution,
ScaleDownClusterStage.PIPELINE_CONFIG_TYPE,
"scaleDown",
scaleDown,
stage,
SyntheticStageOwner.STAGE_AFTER
)

scaleDownStage.setAllowSiblingStagesToContinueOnFailure(true)
scaleDownStage.setContinuePipelineOnFailure(true)
stages << scaleDownStage
}

// Only unpin if we have a source ASG and we didn't scale it down
@@ -317,25 +322,33 @@ class MonitoredDeployStrategy implements Strategy {
allowDeleteActive : false,
retainLargerOverNewer: false
]
stages << newStage(
Stage shrinkClusterStage = newStage(
stage.execution,
ShrinkClusterStage.STAGE_TYPE,
"shrinkCluster",
shrinkContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)

shrinkClusterStage.setAllowSiblingStagesToContinueOnFailure(true)
shrinkClusterStage.setContinuePipelineOnFailure(true)
stages << shrinkClusterStage
}

if (stageData.deploymentMonitor?.id) {
stages << newStage(
Stage notifyDeployCompletedStage = newStage(
stage.execution,
NotifyDeployCompletedStage.PIPELINE_CONFIG_TYPE,
"Notify monitored deploy complete",
evalContext + [hasDeploymentFailed: false],
stage,
SyntheticStageOwner.STAGE_AFTER
)

notifyDeployCompletedStage.setAllowSiblingStagesToContinueOnFailure(true)
notifyDeployCompletedStage.setContinuePipelineOnFailure(true)
stages << notifyDeployCompletedStage
}

return stages
@@ -22,6 +22,7 @@
import com.netflix.spinnaker.orca.ExecutionStatus;
import com.netflix.spinnaker.orca.RetryableTask;
import com.netflix.spinnaker.orca.TaskResult;
import com.netflix.spinnaker.orca.clouddriver.pipeline.monitoreddeploy.NotifyDeployCompletedStage;
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.MonitoredDeployStageData;
import com.netflix.spinnaker.orca.deploymentmonitor.DeploymentMonitorServiceProvider;
import com.netflix.spinnaker.orca.deploymentmonitor.models.DeploymentStep;
@@ -188,6 +189,22 @@ private TaskResult handleError(
}
}

// Don't let failures in NotifyDeployComplete cause rollback
if (stage.getType().equalsIgnoreCase(NotifyDeployCompletedStage.PIPELINE_CONFIG_TYPE)) {
log.warn(
"Failed to get valid response for {} from deployment monitor {}, ignoring failure",
getClass().getSimpleName(),
monitorDefinition,
e);

String userMessage =
String.format(
"Failed to get a valid response from deployment monitor %s, proceeding anyway to avoid unnecessary rollback",
monitorDefinition.getName());

return buildTaskResult(TaskResult.builder(ExecutionStatus.FAILED_CONTINUE), userMessage);
}

if (shouldFailOnError(stage, monitorDefinition)) {
registry
.counter("deploymentMonitor.fatalErrors", "monitorId", monitorDefinition.getId())
@@ -17,6 +17,8 @@
package com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies

import com.netflix.spinnaker.config.DeploymentMonitorDefinition
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ScaleDownClusterStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ShrinkClusterStage
import com.netflix.spinnaker.orca.deploymentmonitor.DeploymentMonitorServiceProvider
import com.netflix.spinnaker.kork.web.exceptions.NotFoundException
import com.netflix.spinnaker.moniker.Moniker
@@ -108,6 +110,28 @@ class MonitoredDeployStrategySpec extends Specification {
afterStages[7].type == DisableServerGroupStage.PIPELINE_CONFIG_TYPE
afterStages[7].context.desiredPercentage == 100

when: 'scale down is requested'
stage.context.scaleDown = true
afterStages = strategy.composeAfterStages(stage)
stage.context.scaleDown = false

then: 'adds a scale down stage'
afterStages.size() == 11
afterStages[9].type == ScaleDownClusterStage.PIPELINE_CONFIG_TYPE
afterStages[9].allowSiblingStagesToContinueOnFailure == true
afterStages[9].continuePipelineOnFailure == true

when: 'shrink is requested'
stage.context.maxRemainingAsgs = 2
afterStages = strategy.composeAfterStages(stage)
stage.context.remove("maxRemainingAsgs")

then: 'adds a shrink cluster stage'
afterStages.size() == 12
afterStages[10].type == ShrinkClusterStage.STAGE_TYPE
afterStages[10].allowSiblingStagesToContinueOnFailure == true
afterStages[10].continuePipelineOnFailure == true

when: 'no deployment monitor specified'
stage.context.remove("deploymentMonitor")
afterStages = strategy.composeAfterStages(stage)
@@ -803,6 +803,17 @@ public void setAllowSiblingStagesToContinueOnFailure(boolean propagateFailuresTo
context.put("allowSiblingStagesToContinueOnFailure", propagateFailuresToParent);
}

@JsonIgnore
public void setContinuePipelineOnFailure(boolean continuePipeline) {
context.put("continuePipeline", continuePipeline);
}

@JsonIgnore
public boolean getContinuePipelineOnFailure() {
StageContext context = (StageContext) getContext();
return (boolean) context.getCurrentOnly("continuePipeline", false);
}

public static class LastModifiedDetails implements Serializable {
private String user;

@@ -18,7 +18,6 @@ package com.netflix.spinnaker.orca.kayenta.pipeline

import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.DisableClusterStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ShrinkClusterStage
import com.netflix.spinnaker.orca.ext.setContinueOnFailure
import com.netflix.spinnaker.orca.kayenta.model.ServerGroupSpec
import com.netflix.spinnaker.orca.kayenta.model.cluster
import com.netflix.spinnaker.orca.kayenta.model.deployments
@@ -47,15 +46,15 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.control.toContext)
it.context["remainingEnabledServerGroups"] = 0
// Even if disabling fails, move on to shrink below which will have another go at destroying the server group
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
},
graph.add {
it.type = DisableClusterStage.STAGE_TYPE
it.name = "Disable experiment cluster ${pair.experiment.cluster}"
it.context.putAll(pair.experiment.toContext)
it.context["remainingEnabledServerGroups"] = 0
// Even if disabling fails, move on to shrink below which will have another go at destroying the server group
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}
)
}
@@ -82,7 +81,7 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.control.toContext)
it.context["allowDeleteActive"] = true
it.context["shrinkToSize"] = 0
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}

// destroy experiment cluster
@@ -92,7 +91,7 @@ class CleanupCanaryClustersStage : StageDefinitionBuilder {
it.context.putAll(pair.experiment.toContext)
it.context["allowDeleteActive"] = true
it.context["shrinkToSize"] = 0
it.setContinueOnFailure(true)
it.continuePipelineOnFailure = true
}
}
}
@@ -128,15 +128,9 @@ inline fun <reified O> Stage.mapTo(): O = mapTo(O::class.java)
fun Stage.shouldFailPipeline(): Boolean =
context["failPipeline"] in listOf(null, true)

fun Stage.shouldContinueOnFailure(): Boolean =
context["continuePipeline"] == true

fun Stage.setContinueOnFailure(continueOnFailure: Boolean) =
context.put("continuePipeline", continueOnFailure)

fun Stage.failureStatus(default: ExecutionStatus = TERMINAL) =
when {
shouldContinueOnFailure() -> FAILED_CONTINUE
continuePipelineOnFailure -> FAILED_CONTINUE
shouldFailPipeline() -> default
else -> STOPPED
}
