Skip to content

Commit

Permalink
feat(traffic-guards): bypass check for pinned server groups (#3507)
Browse files Browse the repository at this point in the history
* feat(traffic-guards): bypass check for pinned server groups

A common use case that trips up traffic guards is things like:
1. staging clusters with a fixed size (e.g. 3/3/3)
2. a deploy fails for some reason, leaving multiple server groups
3. from this point on, all deploys will fail the traffic guard capacity check
   (as you would delete N*3 instances and only leave 3 up)

It seems reasonable to punch an exception through traffic guards when this happens:
- there are multiple server groups going away, all with the same fixed size
- the action leaves _some_ instances up
  • Loading branch information
dreynaud committed Mar 13, 2020
1 parent c554d05 commit 2466ef0
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.netflix.spinnaker.orca.clouddriver.pipeline.servergroup.support;

import com.google.common.collect.ImmutableMap;
import java.util.Map;
import javax.annotation.Nonnegative;
import lombok.Value;

@Value
public class Capacity {
@Nonnegative int min;

@Nonnegative int max;

@Nonnegative int desired;

/**
* @return true if the capacity of this server group is fixed, i.e min, max and desired are all
* the same
*/
public boolean isPinned() {
return (max == desired) && (desired == min);
}

public Map<String, Integer> asMap() {
return ImmutableMap.of("min", min, "max", max, "desired", desired);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ class TargetServerGroup {
return serverGroup.name
}

Capacity getCapacity() {
return new Capacity(
toInt(serverGroup.capacity.min),
toInt(serverGroup.capacity.max),
toInt(serverGroup.capacity.desired))
}

private static int toInt(Object field) {
Integer.parseInt(field.toString())
}

/**
* Used in TrafficGuard, which is Java, which doesn't play nice with @Delegate
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,23 @@ public void verifyTrafficRemoval(
.reduce(0, Integer::sum);

int futureCapacity = currentCapacity - capacityGoingAway;

int someDesiredSize = someServerGroup.getCapacity().getDesired();
if (futureCapacity > 0
&& serverGroupsGoingAway.size() > 1
&& serverGroupsGoingAway.stream().allMatch(sg -> sg.getCapacity().isPinned())
&& serverGroupsGoingAway.stream()
.allMatch(sg -> sg.getCapacity().getDesired() == someDesiredSize)) {
log.debug(
"Bypassing traffic guard check for '{}' in {}/{} with pinned server groups of size {}. Context: {}",
cluster,
account,
location.getValue(),
someDesiredSize,
generateContext(currentServerGroups));
return;
}

double futureCapacityRatio = ((double) futureCapacity) / currentCapacity;
double minCapacityRatio = getMinCapacityRatio();
if (futureCapacityRatio <= minCapacityRatio) {
Expand Down Expand Up @@ -367,6 +384,7 @@ private List<Map> generateContext(Collection<TargetServerGroup> targetServerGrou
.put("name", tsg.getName())
.put("disabled", tsg.isDisabled())
.put("instances", tsg.getInstances())
.put("capacity", tsg.getCapacity())
.build())
.collect(Collectors.toList());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
/*
* Copyright 2015 Netflix, Inc.
* Copyright 2020 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ class TrafficGuardSpec extends Specification {
name: name,
moniker: MonikerHelper.friggaToMoniker(name),
isDisabled: false,
instances: [[healthState: 'Up']] * up + [[healthState: 'OutOfService']] * down
instances: [[healthState: 'Up']] * up + [[healthState: 'OutOfService']] * down,
capacity: [min: 0, max: 4, desired: 3]
] + overrides
}

Expand Down Expand Up @@ -201,6 +202,75 @@ class TrafficGuardSpec extends Specification {
1 * dynamicConfigService.getConfig(Double.class, TrafficGuard.MIN_CAPACITY_RATIO, 0d) >> 0.40d
}

void "should bypass capacity check for pinned server groups"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
List<TargetServerGroup> serverGroupsGoingAway =
[makeServerGroup("app-foo-v000", 3, 0, [capacity: [min: 3, max: 3, desired: 3]]) as TargetServerGroup,
makeServerGroup("app-foo-v001", 3, 0, [capacity: [min: 3, max: 3, desired: 3]]) as TargetServerGroup]

when:
trafficGuard.verifyTrafficRemoval(
serverGroupsGoingAway,
serverGroupsGoingAway + [makeServerGroup("app-foo-v002", 1) as TargetServerGroup],
"test", "x")

then:
notThrown(TrafficGuardException)
1 * front50Service.get("app") >> application
}

void "should still make sure that capacity does not drop to 0 for pinned server groups"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
List<TargetServerGroup> serverGroupsGoingAway =
[makeServerGroup("app-foo-v000", 3, 0, [capacity: [min: 3, max: 3, desired: 3]]) as TargetServerGroup,
makeServerGroup("app-foo-v001", 3, 0, [capacity: [min: 3, max: 3, desired: 3]]) as TargetServerGroup]

when:
trafficGuard.verifyTrafficRemoval(
serverGroupsGoingAway,
serverGroupsGoingAway + [makeServerGroup("app-foo-v002", 0) as TargetServerGroup],
"test", "x")

then:
def e = thrown(TrafficGuardException)
e.message.contains("would leave the cluster with no instances up")
1 * front50Service.get("app") >> application
}

@Unroll
def "should still apply capacity check when pinned server groups don't qualify"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])

when:
trafficGuard.verifyTrafficRemoval(
serverGroupsGoingAway,
serverGroupsGoingAway + [makeServerGroup("app-foo-v002", 1) as TargetServerGroup],
"test", "x")

then:
def e = thrown(TrafficGuardException)
e.message.contains("would leave the cluster with 1 instance up")
1 * front50Service.get("app") >> application
1 * dynamicConfigService.getConfig(Double.class, TrafficGuard.MIN_CAPACITY_RATIO, 0d) >> 0.4d

where:
serverGroupsGoingAway << [
// only one pinned server group going away
[makeServerGroup("app-foo-v000", 100, 0, [capacity: [min: 100, max: 100, desired: 100]]) as TargetServerGroup],

// only some of the server groups going away are pinned
[makeServerGroup("app-foo-v000", 10, 0, [capacity: [min: 10, max: 10, desired: 10]]) as TargetServerGroup,
makeServerGroup("app-foo-v001", 10, 0, [capacity: [min: 10, max: 100, desired: 10]]) as TargetServerGroup],

// the pinned server groups have different sizes
[makeServerGroup("app-foo-v000", 10, 0, [capacity: [min: 1, max: 1, desired: 1]]) as TargetServerGroup,
makeServerGroup("app-foo-v001", 10, 0, [capacity: [min: 100, max: 100, desired: 100]]) as TargetServerGroup]
]
}

void "should not throw exception during a regular shrink/disable cluster-wide operation"() {
given:
addGuard([account: "test", location: "us-east-1", stack: "foo"])
Expand Down

0 comments on commit 2466ef0

Please sign in to comment.