Skip to content

Commit

Permalink
feat(traffic guards): check traffic guards on terminate/disable insta…
Browse files Browse the repository at this point in the history
…nces
  • Loading branch information
anotherchrisberry committed Mar 21, 2017
1 parent 1f89e3b commit 53276a5
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ import com.netflix.spinnaker.orca.TaskResult
import com.netflix.spinnaker.orca.clouddriver.KatoService
import com.netflix.spinnaker.orca.clouddriver.pipeline.instance.TerminatingInstance
import com.netflix.spinnaker.orca.clouddriver.pipeline.instance.TerminatingInstanceSupport
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location
import com.netflix.spinnaker.orca.clouddriver.tasks.AbstractCloudProviderAwareTask
import com.netflix.spinnaker.orca.clouddriver.utils.TrafficGuard
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component
Expand All @@ -38,13 +40,23 @@ class TerminateInstanceAndDecrementServerGroupTask extends AbstractCloudProvider
@Autowired
TerminatingInstanceSupport instanceSupport

@Autowired
TrafficGuard trafficGuard

@Override
TaskResult execute(Stage stage) {
String cloudProvider = getCloudProvider(stage)
String account = getCredentials(stage)

List<TerminatingInstance> remainingInstances = instanceSupport.remainingInstances(stage)

trafficGuard.verifyInstanceTermination(
[stage.context.instance] as List<String>,
account,
Location.region(stage.context.region as String),
cloudProvider,
"Terminating the requested instance in ")

def taskId = kato.requestOperations(cloudProvider, [[(CLOUD_OPERATION_TYPE): stage.context]])
.toBlocking()
.first()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ import com.netflix.spinnaker.orca.clouddriver.KatoService
import com.netflix.spinnaker.orca.clouddriver.model.TaskId
import com.netflix.spinnaker.orca.clouddriver.pipeline.instance.TerminatingInstance
import com.netflix.spinnaker.orca.clouddriver.pipeline.instance.TerminatingInstanceSupport
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location
import com.netflix.spinnaker.orca.clouddriver.tasks.AbstractCloudProviderAwareTask
import com.netflix.spinnaker.orca.clouddriver.utils.TrafficGuard
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component
Expand All @@ -38,13 +40,23 @@ class TerminateInstancesTask extends AbstractCloudProviderAwareTask implements T
@Autowired
TerminatingInstanceSupport instanceSupport

@Autowired
TrafficGuard trafficGuard

@Override
TaskResult execute(Stage stage) {
String cloudProvider = getCloudProvider(stage)
String account = getCredentials(stage)

List<TerminatingInstance> remainingInstances = instanceSupport.remainingInstances(stage)

trafficGuard.verifyInstanceTermination(
stage.context.instanceIds as List<String>,
account,
Location.region(stage.context.region as String),
cloudProvider,
"Terminating the requested instances in")

TaskId taskId = kato.requestOperations(cloudProvider, [[terminateInstances: stage.context]])
.toBlocking()
.first()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
import org.springframework.stereotype.Component;
import retrofit.RetrofitError;

import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;

import static java.lang.String.format;

@Component
public class TrafficGuard {

Expand All @@ -44,19 +44,57 @@ public TrafficGuard(OortHelper oortHelper, Optional<Front50Service> front50Servi
this.front50Service = front50Service.orElse(null);
}

public void verifyInstanceTermination(List<String> instanceIds, String account, Location location, String cloudProvider, String operationDescriptor) {
Map<String, List<String>> instancesPerServerGroup = new HashMap<>();
instanceIds.forEach(instanceId -> {

Optional<String> resolvedServerGroupName = resolveServerGroupNameForInstance(instanceId, account, location.getValue(), cloudProvider);
resolvedServerGroupName.ifPresent(name -> instancesPerServerGroup.computeIfAbsent(name, serverGroup -> new ArrayList<>()).add(instanceId));
});

instancesPerServerGroup.entrySet().forEach(entry -> {
String serverGroupName = entry.getKey();
Names names = Names.parseName(serverGroupName);
if (hasDisableLock(names.getCluster(), account, location)) {
Optional<TargetServerGroup> targetServerGroup = oortHelper.getTargetServerGroup(account, serverGroupName, location.getValue(), cloudProvider);

targetServerGroup.ifPresent(serverGroup -> {
Optional<Map> thisInstance = serverGroup.getInstances().stream().filter(i -> "Up".equals(i.get("healthState"))).findFirst();
if (thisInstance.isPresent() && "Up".equals(thisInstance.get().get("healthState"))) {
long otherActiveInstances = serverGroup.getInstances().stream().filter(i -> "Up".equals(i.get("healthState")) && !entry.getValue().contains(i.get("name"))).count();
if (otherActiveInstances == 0) {
verifyOtherServerGroupsAreTakingTraffic(serverGroupName, location, account, cloudProvider, operationDescriptor);
}
}
});
}
});
}

private Optional<String> resolveServerGroupNameForInstance(String instanceId, String account, String region, String cloudProvider) {
List<Map> searchResults = (List<Map>) oortHelper.getSearchResults(instanceId, "instances", cloudProvider).get(0).getOrDefault("results", new ArrayList<>());
Optional<Map> instance = searchResults.stream().filter(r -> account.equals(r.get("account")) && region.equals(r.get("region"))).findFirst();
// instance not found, assume it's already terminated, what could go wrong
return Optional.ofNullable((String) instance.orElse(new HashMap<>()).get("serverGroup"));
}

public void verifyTrafficRemoval(String serverGroupName, String account, Location location, String cloudProvider, String operationDescriptor) {
Names names = Names.parseName(serverGroupName);

if (!hasDisableLock(names.getCluster(), account, location)) {
return;
}

verifyOtherServerGroupsAreTakingTraffic(serverGroupName, location, account, cloudProvider, operationDescriptor);
}

private void verifyOtherServerGroupsAreTakingTraffic(String serverGroupName, Location location, String account, String cloudProvider, String operationDescriptor) {
Names names = Names.parseName(serverGroupName);
Optional<Map> cluster = oortHelper.getCluster(names.getApp(), account, names.getCluster(), cloudProvider);

if (!cluster.isPresent()) {
throw new IllegalStateException("Could not find traffic-protected cluster.");
throw new IllegalStateException(format("Could not find cluster '%s' in %s/%s with traffic guard configured.", names.getCluster(), account, location.getValue()));
}

List<TargetServerGroup> targetServerGroups = ((List<Map<String, Object>>) cluster.get().get("serverGroups"))
.stream()
.map(TargetServerGroup::new)
Expand All @@ -65,12 +103,11 @@ public void verifyTrafficRemoval(String serverGroupName, String account, Locatio

boolean otherEnabledServerGroupFound = targetServerGroups.stream().anyMatch(tsg ->
!serverGroupName.equals(tsg.getName()) &&
!Boolean.TRUE.equals(tsg.isDisabled()) &&
(tsg.getInstances()).size() > 0
(tsg.getInstances().stream().filter(i -> "Up".equals(i.get("healthState"))).count()) > 0
);
if (!otherEnabledServerGroupFound) {
throw new IllegalStateException(String.format("This cluster has traffic protection enabled. " +
"%s %s would leave the cluster with no instances taking traffic.", operationDescriptor, serverGroupName));
throw new IllegalStateException(format("This cluster ('%s' in %s/%s) has traffic guards enabled. " +
"%s %s would leave the cluster with no instances taking traffic.", names.getCluster(), account, location.getValue(), operationDescriptor, serverGroupName));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,37 @@ import com.netflix.spinnaker.orca.ExecutionStatus
import com.netflix.spinnaker.orca.Task
import com.netflix.spinnaker.orca.TaskResult
import com.netflix.spinnaker.orca.clouddriver.KatoService
import com.netflix.spinnaker.orca.clouddriver.pipeline.instance.TerminatingInstanceSupport
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location
import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware
import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper
import com.netflix.spinnaker.orca.clouddriver.utils.OortHelper
import com.netflix.spinnaker.orca.clouddriver.utils.TrafficGuard
import com.netflix.spinnaker.orca.pipeline.model.Stage
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component

@Component
class DisableInstancesTask implements Task {
class DisableInstancesTask implements CloudProviderAware, Task {

@Autowired KatoService katoService

@Autowired
TrafficGuard trafficGuard

@Override
TaskResult execute(Stage stage) {

String cloudProvider = getCloudProvider(stage)
String account = getCredentials(stage)

trafficGuard.verifyInstanceTermination(
stage.context.instanceIds as List<String>,
account,
Location.region(stage.context.region as String),
cloudProvider,
"Disabling the requested instances in")

def actions = [[disableInstancesInDiscovery: stage.context], [deregisterInstancesFromLoadBalancer: stage.context]]
def taskId = katoService.requestOperations(actions)
.toBlocking()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package com.netflix.spinnaker.orca.clouddriver.utils

import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.Location.Type
import com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support.TargetServerGroup
import com.netflix.spinnaker.orca.front50.Front50Service
import com.netflix.spinnaker.orca.front50.model.Application
import retrofit.RetrofitError
Expand Down Expand Up @@ -127,7 +128,7 @@ class TrafficGuardSpec extends Specification {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
otherServerGroup.isDisabled = false
otherServerGroup.instances = [[id: 'a']]
otherServerGroup.instances = [[id: 'a', healthState: 'Up']]

when:
trafficGuard.verifyTrafficRemoval("app-foo-v001", "test", location, "aws", "x")
Expand All @@ -140,6 +141,23 @@ class TrafficGuardSpec extends Specification {
]
}

void "should throw if another server group is enabled but no instances are 'up'"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
otherServerGroup.isDisabled = false
otherServerGroup.instances = [[id: 'a', healthState: 'OutOfService']]

when:
trafficGuard.verifyTrafficRemoval("app-foo-v001", "test", location, "aws", "x")

then:
thrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getCluster("app", "test", "app-foo", "aws") >> [
serverGroups: [targetServerGroup, otherServerGroup]
]
}

@Unroll
void "hasDisableLock should match on wildcards in stack, detail, account, location"() {
given:
Expand Down Expand Up @@ -209,6 +227,93 @@ class TrafficGuardSpec extends Specification {
1 * front50Service.get("app") >> application
}

void "instance termination should fail when last healthy instance in only server group in cluster"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
targetServerGroup.instances = [[name: "i-1", healthState: "Up"], [name: "i-2", healthState: "Down"]]
when:
trafficGuard.verifyInstanceTermination(["i-1"], "test", location, "aws", "x")

then:
thrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getSearchResults("i-1", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getTargetServerGroup("test", "app-foo-v001", location.value, "aws") >> (targetServerGroup as TargetServerGroup)
1 * oortHelper.getCluster("app", "test", "app-foo", "aws") >> [
serverGroups: [targetServerGroup]
]
}

void "instance termination should fail when last healthy instance in only active server group in cluster"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
targetServerGroup.instances = [[name: "i-1", healthState: "Up"], [name: "i-2", healthState: "Down"]]
otherServerGroup.instances = [[name: "i-1", healthState: "Down"]]
when:
trafficGuard.verifyInstanceTermination(["i-1"], "test", location, "aws", "x")

then:
thrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getSearchResults("i-1", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getTargetServerGroup("test", "app-foo-v001", location.value, "aws") >> (targetServerGroup as TargetServerGroup)
1 * oortHelper.getCluster("app", "test", "app-foo", "aws") >> [
serverGroups: [targetServerGroup, otherServerGroup]
]
}

void "instance termination should succeed when other server group in cluster contains healthy instance"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
targetServerGroup.instances = [[name: "i-1", healthState: "Up"], [name: "i-2", healthState: "Down"]]
otherServerGroup.instances = [[name: "i-1", healthState: "Up"]]
when:
trafficGuard.verifyInstanceTermination(["i-1"], "test", location, "aws", "x")

then:
notThrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getSearchResults("i-1", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getTargetServerGroup("test", "app-foo-v001", location.value, "aws") >> (targetServerGroup as TargetServerGroup)
1 * oortHelper.getCluster("app", "test", "app-foo", "aws") >> [
serverGroups: [targetServerGroup, otherServerGroup]
]
}

void "instance termination should fail when trying to terminate all up instances in the cluster"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
targetServerGroup.instances = [[name: "i-1", healthState: "Up"], [name: "i-2", healthState: "Up"]]
otherServerGroup.instances = [[name: "i-1", healthState: "Down"]]
when:
trafficGuard.verifyInstanceTermination(["i-1", "i-2"], "test", location, "aws", "x")

then:
thrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getSearchResults("i-1", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getSearchResults("i-2", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getTargetServerGroup("test", "app-foo-v001", location.value, "aws") >> (targetServerGroup as TargetServerGroup)
1 * oortHelper.getCluster("app", "test", "app-foo", "aws") >> [
serverGroups: [targetServerGroup, otherServerGroup]
]
}

void "instance termination should succeed when instance is not up, regardless of other instances"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
targetServerGroup.instances = [[name: "i-1"]]
when:
trafficGuard.verifyInstanceTermination(["i-1"], "test", location, "aws", "x")

then:
notThrown(IllegalStateException)
1 * front50Service.get("app") >> application
1 * oortHelper.getSearchResults("i-1", "instances", "aws") >> [ [results: [[account: "test", region: location.value, serverGroup: "app-foo-v001"]]]]
1 * oortHelper.getTargetServerGroup("test", "app-foo-v001", location.value, "aws") >> (targetServerGroup as TargetServerGroup)
0 * _
}

private void addGuard(Map guard) {
applicationDetails.putIfAbsent("trafficGuards", [])
applicationDetails.get("trafficGuards") << guard
Expand Down

0 comments on commit 53276a5

Please sign in to comment.