Skip to content

Commit

Permalink
feat(clouddriver): Favor target capacity when current desired == 0
Browse files Browse the repository at this point in the history
This handles a situation where we appear to be getting an occasional
stale min/max/desired when an asg id is re-used across deployments.

ie. app-v000 was deleted and re-created

Also reaching out to AWS for clarification as to whether this is
expected or not.
  • Loading branch information
ajordens committed Mar 13, 2019
1 parent f933bb7 commit be8a79a
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper
import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper.HealthCountSnapshot
import com.netflix.spinnaker.orca.pipeline.model.Stage
import groovy.util.logging.Slf4j
import org.slf4j.MDC
import org.springframework.stereotype.Component

import java.util.concurrent.TimeUnit

@Component
@Slf4j
class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {
Expand Down Expand Up @@ -182,6 +185,20 @@ class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {

def cloudProvider = stage.context.cloudProvider

Optional<String> taskStartTime = Optional.ofNullable(MDC.get("taskStartTime"));
if (taskStartTime.isPresent()) {
if (System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(10) > Long.valueOf(taskStartTime.get())) {
// expectation is reconciliation has happened within 10 minutes and that the
// current server group capacity should be preferred
log.error(
"Short circuiting initial target capacity determination after 10 minutes (serverGroup: {}, executionId: {})",
"${cloudProvider}:${serverGroup.region}:${serverGroup.name}",
stage.execution.id
)
return serverGroupCapacity
}
}

def initialTargetCapacity = getInitialTargetCapacity(stage, serverGroup)
if (!initialTargetCapacity) {
log.debug(
Expand All @@ -192,7 +209,8 @@ class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {
return serverGroupCapacity
}

if (serverGroup.capacity.max == 0 && initialTargetCapacity.max != 0) {
if ((serverGroup.capacity.max == 0 && initialTargetCapacity.max != 0) ||
(serverGroup.capacity.desired == 0 && initialTargetCapacity.desired > 0)) {
log.info(
"Overriding server group capacity (serverGroup: {}, initialTargetCapacity: {}, executionId: {})",
"${cloudProvider}:${serverGroup.region}:${serverGroup.name}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ import com.netflix.spinnaker.orca.clouddriver.OortService
import com.netflix.spinnaker.orca.jackson.OrcaObjectMapper
import com.netflix.spinnaker.orca.pipeline.model.Execution
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.slf4j.MDC
import retrofit.client.Response
import retrofit.mime.TypedString
import spock.lang.Specification
import spock.lang.Subject
import spock.lang.Unroll

import java.util.concurrent.TimeUnit

import static com.netflix.spinnaker.orca.test.model.ExecutionBuilder.stage

class WaitForUpInstancesTaskSpec extends Specification {
Expand All @@ -40,6 +43,10 @@ class WaitForUpInstancesTaskSpec extends Specification {

def mapper = OrcaObjectMapper.newInstance()

void cleanup() {
MDC.clear()
}

void "should check cluster to get server groups"() {
given:
def pipeline = Execution.newPipeline("orca")
Expand Down Expand Up @@ -504,19 +511,28 @@ class WaitForUpInstancesTaskSpec extends Specification {

def serverGroup = [name: "app-v001", region: "us-west-2", capacity: serverGroupCapacity]

and:
MDC.put("taskStartTime", taskStartTime.toString())

expect:
WaitForUpInstancesTask.getServerGroupCapacity(stage, serverGroup) == expectedServerGroupCapacity

where:
katoTasks | serverGroupCapacity || expectedServerGroupCapacity
null | [min: 0, max: 0, desired: 0] || [min: 0, max: 0, desired: 0]
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | [min: 0, max: 0, desired: 0] || [min: 0, max: 1, desired: 1] // should take initial capacity b/c max = 0
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | [min: 0, max: 2, desired: 2] || [min: 0, max: 2, desired: 2] // should take current capacity b/c max > 0
katoTasks | taskStartTime | serverGroupCapacity || expectedServerGroupCapacity
null | startTime(0) | [min: 0, max: 0, desired: 0] || [min: 0, max: 0, desired: 0]
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9) | [min: 0, max: 0, desired: 0] || [min: 0, max: 1, desired: 1] // should take initial capacity b/c max = 0
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9) | [min: 0, max: 400, desired: 0] || [min: 0, max: 1, desired: 1] // should take initial capacity b/c desired = 0
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9) | [min: 0, max: 2, desired: 2] || [min: 0, max: 2, desired: 2] // should take current capacity b/c max > 0
[[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(11) | [min: 0, max: 0, desired: 0] || [min: 0, max: 0, desired: 0] // should take current capacity b/c timeout
}

static Map deployment(String serverGroupName, String location, int min, int max, int desired) {
return [
serverGroupName: serverGroupName, location: location, capacity: [min: min, max: max, desired: desired]
]
}

static Long startTime(int minutesOld) {
return System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(minutesOld)
}
}

0 comments on commit be8a79a

Please sign in to comment.