feat(orchestration): Allow sibling stages to continue on FAILED_CONTI…

…NUE (#3252) Normally, if a synthetic child stage fails with `FAILED_CONTINUE` the `CompleteStageHandler` will propagate the `FAILED_CONTINUE` status to the parent. This will prevent all subsequent sibling stages from executing. This change allows for an option (similar to Tasks) to continue execution even when a child stage has a `status=FAILED_CONTINUE` This option can only be set on a stage that is a synthetic child. This is needed for Monitored Deploy where we want to allow the deploy to proceed even if the monitoring fails
spinnaker · Oct 25, 2019 · ae0e2cf · ae0e2cf
1 parent 7f02cb8
commit ae0e2cf
Show file tree

Hide file tree

Showing 6 changed files with 122 additions and 3 deletions.
diff --git a/...spinnaker/orca/clouddriver/pipeline/servergroup/strategies/MonitoredDeployStrategy.groovy b/...spinnaker/orca/clouddriver/pipeline/servergroup/strategies/MonitoredDeployStrategy.groovy
@@ -15,7 +15,6 @@
  */
 package com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies
 
-import com.fasterxml.jackson.annotation.JsonProperty
 import com.netflix.spinnaker.config.DeploymentMonitorServiceProvider
 import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.RollbackClusterStage
 import com.netflix.spinnaker.orca.clouddriver.pipeline.cluster.ScaleDownClusterStage
@@ -181,6 +180,7 @@ class MonitoredDeployStrategy implements Strategy {
         stage,
         SyntheticStageOwner.STAGE_AFTER
       )
+      notifyDeployStartingStage.setAllowSiblingStagesToContinueOnFailure(true)
       stages << notifyDeployStartingStage
     } else {
       log.warn("No deployment monitor specified, all monitoring will be skipped")
@@ -246,14 +246,17 @@ class MonitoredDeployStrategy implements Strategy {
       if (stageData.deploymentMonitor?.id) {
         evalContext.currentProgress = p
 
-        stages << newStage(
+        Stage evaluateHealthStage = newStage(
           stage.execution,
           EvaluateDeploymentHealthStage.PIPELINE_CONFIG_TYPE,
           "Evaluate health of deployed instances",
           evalContext,
           stage,
           SyntheticStageOwner.STAGE_AFTER
         )
+        evaluateHealthStage.setAllowSiblingStagesToContinueOnFailure(true)
+
+        stages << evaluateHealthStage
       }
     })
 

diff --git a/orca-core/src/main/java/com/netflix/spinnaker/orca/pipeline/model/Stage.java b/orca-core/src/main/java/com/netflix/spinnaker/orca/pipeline/model/Stage.java
@@ -39,6 +39,7 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Sets;
+import com.netflix.spinnaker.kork.exceptions.SpinnakerException;
 import com.netflix.spinnaker.orca.ExecutionStatus;
 import com.netflix.spinnaker.orca.jackson.OrcaObjectMapper;
 import com.netflix.spinnaker.orca.pipeline.model.support.RequisiteStageRefIdDeserializer;
@@ -755,6 +756,38 @@ public Optional<Long> getTimeout() {
     return Optional.empty();
   }
 
+  /**
+   * Check if this stage should propagate FAILED_CONTINUE to parent stage. Normally, if a synthetic
+   * child fails with FAILED_CONTINUE {@link
+   * com.netflix.spinnaker.orca.q.handler.CompleteStageHandler} will propagate the FAILED_CONTINUE
+   * status to the parent, preventing all subsequent sibling stages from executing. This allows for
+   * an option (similar to Tasks) to continue execution if a child stage returns FAILED_CONTINUE
+   *
+   * @return true if we want to allow subsequent siblings to continue even if this stage returns
+   *     FAILED_CONTINUE
+   */
+  @JsonIgnore
+  public boolean getAllowSiblingStagesToContinueOnFailure() {
+    if (parentStageId == null) {
+      return false;
+    }
+
+    StageContext context = (StageContext) getContext();
+    return (boolean) context.getCurrentOnly("allowSiblingStagesToContinueOnFailure", false);
+  }
+
+  @JsonIgnore
+  public void setAllowSiblingStagesToContinueOnFailure(boolean propagateFailuresToParent) {
+    if (parentStageId == null) {
+      throw new SpinnakerException(
+          String.format(
+              "Not allowed to set propagateFailuresToParent on a non-child stage: %s with id %s",
+              getType(), getId()));
+    }
+
+    context.put("allowSiblingStagesToContinueOnFailure", propagateFailuresToParent);
+  }
+
   public static class LastModifiedDetails implements Serializable {
     private String user;
 

diff --git a/orca-core/src/main/java/com/netflix/spinnaker/orca/pipeline/model/StageContext.java b/orca-core/src/main/java/com/netflix/spinnaker/orca/pipeline/model/StageContext.java
@@ -64,6 +64,16 @@ public Object get(@Nullable Object key) {
     }
   }
 
+  /**
+   * Get a value from the current context ONLY - never looking at the ancestors' outputs
+   *
+   * @param key The key to look
+   * @param defaultValue default value to return if key is not present
+   * @return value or null if not present
+   */
+  Object getCurrentOnly(@Nullable Object key, Object defaultValue) {
+    return super.getOrDefault(key, defaultValue);
+  }
   /*
    * Gets all objects matching 'key', sorted by proximity to the current stage.
    * If the key exists in the current context, it will be the first element returned

diff --git a/orca-core/src/test/groovy/com/netflix/spinnaker/orca/pipeline/model/StageSpec.groovy b/orca-core/src/test/groovy/com/netflix/spinnaker/orca/pipeline/model/StageSpec.groovy
@@ -16,6 +16,7 @@
 
 package com.netflix.spinnaker.orca.pipeline.model
 
+import com.netflix.spinnaker.kork.exceptions.SpinnakerException
 import spock.lang.Specification
 import spock.lang.Unroll
 
@@ -318,4 +319,41 @@ class StageSpec extends Specification {
     100             | 200             | null            || 200
     100             | 200             | 300             || 300
   }
+
+  def "should set propagateFailuresToParent correctly"() {
+    given:
+    def pipeline = pipeline {
+      stage {
+        refId = "parent"
+
+        stage {
+          refId = "child"
+        }
+      }
+    }
+
+    def parentStage = pipeline.stageByRef("parent")
+    def childStage = pipeline.stageByRef("child")
+
+    when: 'trying to set PropagateFailuresToParent on parent stage'
+    parentStage.setAllowSiblingStagesToContinueOnFailure(true)
+
+    then: 'it should fail'
+    thrown(SpinnakerException)
+
+    when: 'parent stage erroneously has the setting in context'
+    parentStage = pipeline.stageByRef("parent")
+    parentStage.context.put("allowSiblingStagesToContinueOnFailure", true)
+
+    then: 'we ignore the value in context'
+    childStage.getAllowSiblingStagesToContinueOnFailure() == false
+
+    when: 'trying to set PropagateFailuresToParent on a child stage'
+    childStage.setAllowSiblingStagesToContinueOnFailure(true)
+
+    then: 'it should succeed'
+    noExceptionThrown()
+    childStage.context.allowSiblingStagesToContinueOnFailure == true
+    childStage.getAllowSiblingStagesToContinueOnFailure() == true
+  }
 }
diff --git a/orca-queue/src/main/kotlin/com/netflix/spinnaker/orca/q/handler/CompleteStageHandler.kt b/orca-queue/src/main/kotlin/com/netflix/spinnaker/orca/q/handler/CompleteStageHandler.kt
@@ -142,7 +142,7 @@ class CompleteStageHandler(
 
         // When a synthetic stage ends with FAILED_CONTINUE, propagate that status up to the stage's
         // parent so that no more of the parent's synthetic children will run.
-        if (stage.status == FAILED_CONTINUE && stage.syntheticStageOwner != null) {
+        if (stage.status == FAILED_CONTINUE && stage.syntheticStageOwner != null && !stage.allowSiblingStagesToContinueOnFailure) {
           queue.push(message.copy(stageId = stage.parentStageId!!))
         } else if (stage.status in listOf(SUCCEEDED, FAILED_CONTINUE, SKIPPED)) {
           stage.startNext()

diff --git a/orca-queue/src/test/kotlin/com/netflix/spinnaker/orca/q/handler/CompleteStageHandlerTest.kt b/orca-queue/src/test/kotlin/com/netflix/spinnaker/orca/q/handler/CompleteStageHandlerTest.kt
@@ -1078,6 +1078,41 @@ object CompleteStageHandlerTest : SubjectSpek<CompleteStageHandler>({
           verify(queue).push(message.copy(stageId = pipeline.stageByRef("1").id))
         }
       }
+
+      given("a synthetic stage's task ends with $FAILED_CONTINUE status and the synthetic allows siblings to continue") {
+        val pipeline = pipeline {
+          stage {
+            refId = "1"
+            type = stageWithSyntheticBefore.type
+            stageWithSyntheticBefore.buildBeforeStages(this)
+            stageWithSyntheticBefore.plan(this)
+          }
+        }
+
+        val syntheticStage = pipeline.stageByRef("1<1")
+        syntheticStage.allowSiblingStagesToContinueOnFailure = true
+        val message = CompleteStage(syntheticStage)
+
+        beforeGroup {
+          pipeline.stageById(message.stageId).apply {
+            status = RUNNING
+            singleTaskStage.plan(this)
+            tasks.first().status = FAILED_CONTINUE
+          }
+
+          whenever(repository.retrieve(PIPELINE, message.executionId)) doReturn pipeline
+        }
+
+        on("receiving the message") {
+          subject.handle(message)
+        }
+
+        afterGroup(::resetMocks)
+
+        it("starts executing the next sibling") {
+          verify(queue).push(StartStage(pipeline.stageByRef("1<2")))
+        }
+      }
     }
 
     setOf(TERMINAL, CANCELED).forEach { taskStatus ->