fix(RRB & MD): pin and unpin the correct server group (#3197)
During a rolling red/black (or monitored) deploy, use the determined source server group to pin/unpin.
This logic used to live in the deploy preprocessor, but the preprocessor runs too early to be useful (it runs before the determineSourceServerGroup task). That means it can pick the wrong ASG to pin in the following scenario:

```
ASG v2 - being deployed
ASG v1 - Disabled
ASG v0 - Enabled
```

Before this change, we would pin/unpin `v1`, which is clearly the wrong ASG to pin.

NOTE: the code is currently duplicated between the RRB and MD strategies. I will follow up with another PR to refactor them to share code (it's proving harder than anticipated to make that refactor actually nice).
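For context, "pinning" here is just a resize of the source server group with `pinMinimumCapacity: true` (unpinning sets `unpinMinimumCapacity: true` instead). A minimal sketch of the context both strategies now build, using the names from the diff below (`source` is assumed to be already resolved):

```
// Sketch only: mirrors the resize context the RRB/MD strategies build below.
// Assumes `source` is the ResizeStrategy.Source resolved *after* the
// determineSourceServerGroup task has run, i.e. the enabled ASG (v0 above),
// not a disabled one (v1).
def resizeContext = [
  serverGroupName   : source.serverGroupName,
  action            : ResizeStrategy.ResizeAction.scale_to_server_group,
  source            : source,
  useNameAsLabel    : true, // hint to deck that it should _not_ override the name
  pinMinimumCapacity: true  // set unpinMinimumCapacity: true to unpin instead
]
```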
marchello2000 authored Sep 27, 2019
1 parent 4819162 commit 717792b
Showing 5 changed files with 372 additions and 157 deletions.
AwsDeployStagePreProcessor.groovy
@@ -17,36 +17,21 @@

package com.netflix.spinnaker.orca.clouddriver.pipeline.providers.aws

import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.PinServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.AbstractDeployStrategyStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.strategies.DeployStagePreProcessor
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroupResolver
import com.netflix.spinnaker.orca.kato.pipeline.strategy.Strategy
import com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategy
import com.netflix.spinnaker.orca.kato.pipeline.support.StageData
import com.netflix.spinnaker.orca.pipeline.CheckPreconditionsStage
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component

import java.util.concurrent.TimeUnit

import static com.netflix.spinnaker.orca.kato.pipeline.support.ResizeStrategySupport.getSource

@Component
class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
private static final List<Strategy> rollingStrategies = [Strategy.ROLLING_RED_BLACK, Strategy.MONITORED]

@Autowired
ApplySourceServerGroupCapacityStage applySourceServerGroupSnapshotStage

@Autowired
PinServerGroupStage pinServerGroupStage

@Autowired
TargetServerGroupResolver targetServerGroupResolver

@Autowired
CheckPreconditionsStage checkPreconditionsStage

@@ -93,24 +78,6 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
)
}

if (shouldPinSourceServerGroup(stageData.strategy)) {
def optionalResizeContext = getResizeContext(stageData)
if (!optionalResizeContext.isPresent()) {
// this means we don't need to resize anything
// happens in particular when there is no pre-existing source server group
return stageDefinitions
}

def resizeContext = optionalResizeContext.get()
resizeContext.pinMinimumCapacity = true

stageDefinitions << new StageDefinition(
name: "Pin ${resizeContext.serverGroupName}",
stageDefinitionBuilder: pinServerGroupStage,
context: resizeContext
)
}

return stageDefinitions
}

@@ -129,24 +96,6 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
)
}

def unpinServerGroupStage = buildUnpinServerGroupStage(stageData, false)
if (unpinServerGroupStage) {
stageDefinitions << unpinServerGroupStage
}

return stageDefinitions
}

@Override
List<StageDefinition> onFailureStageDefinitions(Stage stage) {
def stageData = stage.mapTo(StageData)
def stageDefinitions = []

def unpinServerGroupStage = buildUnpinServerGroupStage(stageData, true)
if (unpinServerGroupStage) {
stageDefinitions << unpinServerGroupStage
}

return stageDefinitions
}

@@ -156,74 +105,10 @@ class AwsDeployStagePreProcessor implements DeployStagePreProcessor {
return stageData.cloudProvider == "aws" // && stageData.useSourceCapacity
}

private static boolean shouldPinSourceServerGroup(String strategy) {
// TODO-AJ consciously only enabling for rolling red/black -- will add support for other strategies after it's working
return (rollingStrategies.contains(Strategy.fromStrategyKey(strategy)))
}

private static boolean shouldCheckServerGroupsPreconditions(StageData stageData) {
// TODO(dreynaud): enabling cautiously for RRB/MD only for testing, but we would ideally roll this out to other strategies
Strategy strategy = Strategy.fromStrategyKey(stageData.strategy)

return (rollingStrategies.contains(strategy) && (stageData.maxInitialAsgs != -1))
}

private Optional<Map<String, Object>> getResizeContext(StageData stageData) {
def cleanupConfig = AbstractDeployStrategyStage.CleanupConfig.fromStage(stageData)
def baseContext = [
(cleanupConfig.location.singularType()): cleanupConfig.location.value,
cluster : cleanupConfig.cluster,
moniker : cleanupConfig.moniker,
credentials : cleanupConfig.account,
cloudProvider : cleanupConfig.cloudProvider,
]

try {
def source = getSource(targetServerGroupResolver, stageData, baseContext)
if (!source) {
return Optional.empty()
}

baseContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true // hint to deck that it should _not_ override the name
])
return Optional.of(baseContext)
} catch(TargetServerGroup.NotFoundException e) {
return Optional.empty()
}
}

private StageDefinition buildUnpinServerGroupStage(StageData stageData, boolean deployFailed) {
if (!shouldPinSourceServerGroup(stageData.strategy)) {
return null
}

if (stageData.scaleDown && !deployFailed) {
// source server group has been scaled down, no need to unpin if deploy was successful
return null
}

def optionalResizeContext = getResizeContext(stageData)
if (!optionalResizeContext.isPresent()) {
// no source server group, no need to unpin anything
return null
}

def resizeContext = optionalResizeContext.get()
resizeContext.unpinMinimumCapacity = true

if (deployFailed) {
// we want to specify a new timeout explicitly here, in case the deploy itself failed because of a timeout
resizeContext.stageTimeoutMs = TimeUnit.MINUTES.toMillis(20)
}

return new StageDefinition(
name: "Unpin ${resizeContext.serverGroupName} (deployFailed=${deployFailed})".toString(),
stageDefinitionBuilder: pinServerGroupStage,
context: resizeContext
)
}
}
AbstractDeployStrategyStage.groovy
@@ -165,19 +165,24 @@ abstract class AbstractDeployStrategyStage extends AbstractCloudProviderAwareStage

@Override
void onFailureStages(Stage stage, StageGraphBuilder graph) {
// TODO(mvulfson): Strategy on failure
Strategy strategy = getStrategy(stage)
// Strategy should never be null during regular execution, but it can be in unit tests.
// Either way, be defensive.
if (strategy != null) {
strategy.composeOnFailureStages(stage).forEach({ graph.append(it) })
}

deployStagePreProcessors
.findAll { it.supports(stage) }
.collect { it.onFailureStageDefinitions(stage) }
.flatten()
.forEach { StageDefinition stageDefinition ->
graph.add {
it.type = stageDefinition.stageDefinitionBuilder.type
it.name = stageDefinition.name
it.context = stageDefinition.context
}
}
}

/**
MonitoredDeployStrategy.groovy
@@ -25,6 +25,7 @@ import com.netflix.spinnaker.orca.clouddriver.pipeline.monitoreddeploy.EvaluateD
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.CloneServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.CreateServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.DisableServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.PinServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.ResizeServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.DetermineTargetServerGroupStage
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
@@ -38,6 +39,9 @@ import groovy.util.logging.Slf4j
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty
import org.springframework.stereotype.Component

import java.util.concurrent.TimeUnit

import static com.netflix.spinnaker.orca.pipeline.StageDefinitionBuilder.newStage
import static com.netflix.spinnaker.orca.kato.pipeline.strategy.Strategy.MONITORED

@@ -129,22 +133,7 @@ class MonitoredDeployStrategy implements Strategy {
def source = null

try {
StageData.Source sourceServerGroup

Stage parentCreateServerGroupStage = stage.directAncestors()
.find() { it.type == CreateServerGroupStage.PIPELINE_CONFIG_TYPE || it.type == CloneServerGroupStage.PIPELINE_CONFIG_TYPE }

StageData parentStageData = parentCreateServerGroupStage.mapTo(StageData)
sourceServerGroup = parentStageData.source

if (sourceServerGroup != null && (sourceServerGroup.serverGroupName != null || sourceServerGroup.asgName != null)) {
source = new ResizeStrategy.Source(
region: sourceServerGroup.region,
serverGroupName: sourceServerGroup.serverGroupName ?: sourceServerGroup.asgName,
credentials: stageData.credentials ?: stageData.account,
cloudProvider: stageData.cloudProvider
)
}
source = lookupSourceServerGroup(stage)
} catch (Exception e) {
// This probably means there was no parent CreateServerGroup stage - which should never happen
throw new IllegalStateException("Failed to determine source server group from parent stage while planning Monitored Deploy flow", e)
@@ -180,7 +169,23 @@ class MonitoredDeployStrategy implements Strategy {
if (source == null) {
log.info("no source server group -- will perform Monitored Deploy to exact fallback capacity $savedCapacity with no disableCluster or scaleDownCluster stages")
} else {
// TODO(mvulfson): Pin Source
def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
pinMinimumCapacity: true
])

stages << newStage(
stage.execution,
PinServerGroupStage.TYPE,
"Pin ${resizeContext.serverGroupName}",
resizeContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)
}

if (mdsd.deploymentMonitor.id) {
@@ -296,9 +301,25 @@ class MonitoredDeployStrategy implements Strategy {
)
}

// TODO(mvulfson): Unpin
if (source) {
// Only unpin if we have a source ASG and we didn't scale it down
if (source && !stageData.scaleDown) {
def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
unpinMinimumCapacity: true
])

stages << newStage(
stage.execution,
PinServerGroupStage.TYPE,
"Unpin ${resizeContext.serverGroupName}",
resizeContext,
stage,
SyntheticStageOwner.STAGE_AFTER
)
}

// TODO(mvulfson): Shrink cluster
@@ -317,6 +338,78 @@

return stages
}

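/**
* On deploy failure, unpins the source server group (if any) so it is not left pinned
* at its minimum capacity. Uses an explicit timeout in case the deploy itself failed
* because of a timeout.
*/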
@Override
List<Stage> composeOnFailureStages(Stage parent) {
def source = null
def stages = []

try {
source = lookupSourceServerGroup(parent)
} catch (Exception e) {
log.warn("Failed to lookup source server group during composeOnFailureStages", e)
}

// No source, nothing to unpin
if (source == null) {
return stages
}

def cleanupConfig = AbstractDeployStrategyStage.CleanupConfig.fromStage(parent)

Map baseContext = [
(cleanupConfig.location.singularType()): cleanupConfig.location.value,
cluster : cleanupConfig.cluster,
moniker : cleanupConfig.moniker,
credentials : cleanupConfig.account,
cloudProvider : cleanupConfig.cloudProvider,
]

def resizeContext = baseContext
resizeContext.putAll([
serverGroupName : source.serverGroupName,
action : ResizeStrategy.ResizeAction.scale_to_server_group,
source : source,
useNameAsLabel : true, // hint to deck that it should _not_ override the name
unpinMinimumCapacity: true,
// we want to specify a new timeout explicitly here, in case the deploy itself failed because of a timeout
stageTimeoutMs : TimeUnit.MINUTES.toMillis(20)
])

stages << newStage(
parent.execution,
PinServerGroupStage.TYPE,
"Unpin ${resizeContext.serverGroupName}",
resizeContext,
parent,
SyntheticStageOwner.STAGE_AFTER
)

return stages
}

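/**
* Resolves the source server group recorded on the parent CreateServerGroup/CloneServerGroup
* stage (the result of the determineSourceServerGroup task), or null when there is no
* pre-existing source server group.
*/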
ResizeStrategy.Source lookupSourceServerGroup(Stage stage) {
ResizeStrategy.Source source = null
StageData.Source sourceServerGroup

Stage parentCreateServerGroupStage = stage.directAncestors()
.find() { it.type == CreateServerGroupStage.PIPELINE_CONFIG_TYPE || it.type == CloneServerGroupStage.PIPELINE_CONFIG_TYPE }

StageData parentStageData = parentCreateServerGroupStage.mapTo(StageData)
sourceServerGroup = parentStageData.source
MonitoredDeployStageData stageData = stage.mapTo(MonitoredDeployStageData)

if (sourceServerGroup != null && (sourceServerGroup.serverGroupName != null || sourceServerGroup.asgName != null)) {
source = new ResizeStrategy.Source(
region: sourceServerGroup.region,
serverGroupName: sourceServerGroup.serverGroupName ?: sourceServerGroup.asgName,
credentials: stageData.credentials ?: stageData.account,
cloudProvider: stageData.cloudProvider
)
}

return source
}
}

// TODO(mvulfson): Move this someplace nice
(2 more changed files not shown)
