From 8386dc94d9d9747ad09d3ab04ee511d4c65f38a4 Mon Sep 17 00:00:00 2001 From: Rob Zienert Date: Thu, 5 Apr 2018 13:05:55 -0700 Subject: [PATCH] fix(aws): Make deploy atomic operation more resilient to AWS failures (#2463) --- .../aws/deploy/AsgReferenceCopier.groovy | 21 +++- .../aws/deploy/AutoScalingWorker.groovy | 86 ++++++++++++-- .../deploy/AutoScalingWorkerUnitSpec.groovy | 111 +++++++++++++++++- 3 files changed, 199 insertions(+), 19 deletions(-) diff --git a/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AsgReferenceCopier.groovy b/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AsgReferenceCopier.groovy index fc34de6fd2a..787dfb6275b 100644 --- a/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AsgReferenceCopier.groovy +++ b/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AsgReferenceCopier.groovy @@ -16,15 +16,20 @@ package com.netflix.spinnaker.clouddriver.aws.deploy import com.amazonaws.services.autoscaling.AmazonAutoScaling -import com.amazonaws.services.autoscaling.model.* -import com.netflix.spinnaker.clouddriver.aws.deploy.scalingpolicy.ScalingPolicyCopier +import com.amazonaws.services.autoscaling.model.AlreadyExistsException +import com.amazonaws.services.autoscaling.model.DescribeScheduledActionsRequest +import com.amazonaws.services.autoscaling.model.DescribeScheduledActionsResult +import com.amazonaws.services.autoscaling.model.PutScheduledUpdateGroupActionRequest +import com.amazonaws.services.autoscaling.model.ScheduledUpdateGroupAction +import com.netflix.spinnaker.clouddriver.aws.model.AwsResultsRetriever import com.netflix.spinnaker.clouddriver.aws.security.AmazonClientProvider import com.netflix.spinnaker.clouddriver.aws.security.NetflixAmazonCredentials -import com.netflix.spinnaker.clouddriver.data.task.Task -import com.netflix.spinnaker.clouddriver.aws.model.AwsResultsRetriever import com.netflix.spinnaker.clouddriver.aws.services.IdGenerator +import com.netflix.spinnaker.clouddriver.data.task.Task import groovy.transform.Canonical +import groovy.util.logging.Slf4j +@Slf4j @Canonical class AsgReferenceCopier { @@ -59,7 +64,13 @@ class AsgReferenceCopier { if (startTime?.time > System.currentTimeMillis()) { request.withStartTime(startTime) } - targetAutoScaling.putScheduledUpdateGroupAction(request) + + try { + targetAutoScaling.putScheduledUpdateGroupAction(request) + } catch (AlreadyExistsException e) { + // This should never happen as the name is generated with a UUID. + log.warn("Scheduled action already exists on ASG, continuing (request: $request)") + } task.updateStatus "AWS_DEPLOY", "Creating scheduled action (${request}) on ${targetRegion}/${targetAsgName} from ${sourceRegion}/${sourceAsgName}..." } diff --git a/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorker.groovy b/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorker.groovy index 5d993461986..ebaa78e69ab 100644 --- a/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorker.groovy +++ b/clouddriver-aws/src/main/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorker.groovy @@ -15,28 +15,40 @@ */ package com.netflix.spinnaker.clouddriver.aws.deploy + +import com.amazonaws.services.autoscaling.AmazonAutoScaling +import com.amazonaws.services.autoscaling.model.AlreadyExistsException +import com.amazonaws.services.autoscaling.model.AutoScalingGroup import com.amazonaws.services.autoscaling.model.CreateAutoScalingGroupRequest +import com.amazonaws.services.autoscaling.model.DescribeAutoScalingGroupsRequest +import com.amazonaws.services.autoscaling.model.DescribeAutoScalingGroupsResult import com.amazonaws.services.autoscaling.model.EnableMetricsCollectionRequest import com.amazonaws.services.autoscaling.model.SuspendProcessesRequest import com.amazonaws.services.autoscaling.model.Tag import com.amazonaws.services.autoscaling.model.UpdateAutoScalingGroupRequest import com.amazonaws.services.ec2.model.DescribeSubnetsResult import com.amazonaws.services.ec2.model.Subnet -import com.netflix.spinnaker.clouddriver.aws.security.NetflixAmazonCredentials -import com.netflix.spinnaker.clouddriver.data.task.Task -import com.netflix.spinnaker.clouddriver.data.task.TaskRepository import com.netflix.spinnaker.clouddriver.aws.model.AmazonBlockDevice import com.netflix.spinnaker.clouddriver.aws.model.AutoScalingProcessType import com.netflix.spinnaker.clouddriver.aws.model.SubnetData import com.netflix.spinnaker.clouddriver.aws.model.SubnetTarget +import com.netflix.spinnaker.clouddriver.aws.security.NetflixAmazonCredentials import com.netflix.spinnaker.clouddriver.aws.services.RegionScopedProviderFactory +import com.netflix.spinnaker.clouddriver.data.task.Task +import com.netflix.spinnaker.clouddriver.data.task.TaskRepository import com.netflix.spinnaker.kork.core.RetrySupport +import groovy.util.logging.Slf4j + +import java.time.Instant +import java.time.temporal.ChronoUnit +import java.util.function.Supplier /** * A worker class dedicated to the deployment of "applications", following many of Netflix's common AWS conventions. * * */ +@Slf4j class AutoScalingWorker { private static final String AWS_PHASE = "AWS_DEPLOY" @@ -147,7 +159,7 @@ class AutoScalingWorker { String launchConfigName = regionScopedProvider.getLaunchConfigurationBuilder().buildLaunchConfiguration(application, subnetType, settings, legacyUdf) - task.updateStatus AWS_PHASE, "Deploying ASG." + task.updateStatus AWS_PHASE, "Deploying ASG: $asgName" createAutoScalingGroup(asgName, launchConfigName) } @@ -232,17 +244,33 @@ class AutoScalingWorker { } def autoScaling = regionScopedProvider.autoScaling - retrySupport.retry({ -> autoScaling.createAutoScalingGroup(request) }, 5, 1000, true) + Exception ex = retrySupport.retry({ -> + try { + autoScaling.createAutoScalingGroup(request) + } catch (AlreadyExistsException e) { + if (!shouldProceedWithExistingState(autoScaling, asgName, request)) { + return e + } + log.debug("Determined pre-existing ASG is desired state, continuing...", e) + } + }, 10, 1000, false) + if (ex != null) { + throw ex + } if (suspendedProcesses) { - autoScaling.suspendProcesses(new SuspendProcessesRequest(autoScalingGroupName: asgName, scalingProcesses: suspendedProcesses)) + retrySupport.retry({ -> + autoScaling.suspendProcesses(new SuspendProcessesRequest(autoScalingGroupName: asgName, scalingProcesses: suspendedProcesses)) + }, 10, 1000, false) } if (enabledMetrics && instanceMonitoring) { task.updateStatus AWS_PHASE, "Enabling metrics collection for: $asgName" - autoScaling.enableMetricsCollection(new EnableMetricsCollectionRequest() - .withAutoScalingGroupName(asgName) - .withGranularity('1Minute') - .withMetrics(enabledMetrics)) + retrySupport.retry({ -> + autoScaling.enableMetricsCollection(new EnableMetricsCollectionRequest() + .withAutoScalingGroupName(asgName) + .withGranularity('1Minute') + .withMetrics(enabledMetrics)) + }, 10, 1000, false) } retrySupport.retry({ -> @@ -254,8 +282,44 @@ class AutoScalingWorker { desiredCapacity: desiredInstances ) ) - }, 5, 1000, true) + }, 10, 1000, false) asgName } + + private boolean shouldProceedWithExistingState(AmazonAutoScaling autoScaling, String asgName, CreateAutoScalingGroupRequest request) { + DescribeAutoScalingGroupsResult result = autoScaling.describeAutoScalingGroups( + new DescribeAutoScalingGroupsRequest().withAutoScalingGroupNames(asgName) + ) + if (result.autoScalingGroups.isEmpty()) { + // This will only happen if we get an AlreadyExistsException from AWS, then immediately after describing it, we + // don't get a result back. We'll continue with trying to create because who knows... may as well try. + log.error("Attempted to find pre-existing ASG but none was found: $asgName") + return true + } + AutoScalingGroup existingAsg = result.autoScalingGroups.first() + + Set failedPredicates = [ + "launch configuration": { return existingAsg.launchConfigurationName == request.launchConfigurationName }, + "availability zones": { return existingAsg.availabilityZones == request.availabilityZones }, + "subnets": { return existingAsg.getVPCZoneIdentifier() == request.getVPCZoneIdentifier() }, + "load balancers": { return existingAsg.loadBalancerNames == request.loadBalancerNames }, + "target groups": { return existingAsg.targetGroupARNs == request.targetGroupARNs }, + "cooldown": { return existingAsg.defaultCooldown == request.defaultCooldown }, + "health check grace period": { return existingAsg.healthCheckGracePeriod == request.healthCheckGracePeriod }, + "health check type": { return existingAsg.healthCheckType == request.healthCheckType }, + "termination policies": { return existingAsg.terminationPolicies == request.terminationPolicies } + ].findAll { !((Supplier) it.value).get() }.keySet() + + if (!failedPredicates.isEmpty()) { + task.updateStatus AWS_PHASE, "$asgName already exists and does not seem to match desired state on: ${failedPredicates.join(", ")}" + return false + } + if (existingAsg.createdTime.toInstant().isBefore(Instant.now().minus(1, ChronoUnit.HOURS))) { + task.updateStatus AWS_PHASE, "$asgName already exists and appears to be valid, but falls outside of window for deploy retry" + return false + } + + return true + } } diff --git a/clouddriver-aws/src/test/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorkerUnitSpec.groovy b/clouddriver-aws/src/test/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorkerUnitSpec.groovy index b526fc42192..7d403a26365 100644 --- a/clouddriver-aws/src/test/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorkerUnitSpec.groovy +++ b/clouddriver-aws/src/test/groovy/com/netflix/spinnaker/clouddriver/aws/deploy/AutoScalingWorkerUnitSpec.groovy @@ -17,15 +17,17 @@ package com.netflix.spinnaker.clouddriver.aws.deploy import com.amazonaws.services.autoscaling.AmazonAutoScaling +import com.amazonaws.services.autoscaling.model.AlreadyExistsException import com.amazonaws.services.autoscaling.model.AutoScalingGroup +import com.amazonaws.services.autoscaling.model.DescribeAutoScalingGroupsResult import com.amazonaws.services.ec2.AmazonEC2 import com.amazonaws.services.ec2.model.Subnet -import com.netflix.spinnaker.clouddriver.data.task.DefaultTask -import com.netflix.spinnaker.clouddriver.data.task.Task -import com.netflix.spinnaker.clouddriver.data.task.TaskRepository import com.netflix.spinnaker.clouddriver.aws.TestCredential import com.netflix.spinnaker.clouddriver.aws.services.AsgService import com.netflix.spinnaker.clouddriver.aws.services.RegionScopedProviderFactory +import com.netflix.spinnaker.clouddriver.data.task.DefaultTask +import com.netflix.spinnaker.clouddriver.data.task.Task +import com.netflix.spinnaker.clouddriver.data.task.TaskRepository import com.netflix.spinnaker.clouddriver.model.Cluster import com.netflix.spinnaker.clouddriver.model.ClusterProvider import com.netflix.spinnaker.clouddriver.model.ServerGroup @@ -33,6 +35,9 @@ import org.springframework.beans.factory.annotation.Autowired import spock.lang.Specification import spock.lang.Unroll +import java.time.Instant +import java.time.temporal.ChronoUnit + class AutoScalingWorkerUnitSpec extends Specification { @Autowired @@ -173,6 +178,106 @@ class AutoScalingWorkerUnitSpec extends Specification { 1 * autoScaling.enableMetricsCollection({ it.metrics == ['GroupMinSize', 'GroupMaxSize'] }) } + void "continues if serverGroup already exists, is reasonably the same and within safety window"() { + setup: + def autoScalingWorker = new AutoScalingWorker( + enabledMetrics: ['GroupMinSize', 'GroupMaxSize'], + instanceMonitoring: true, + regionScopedProvider: regionScopedProvider, + credentials: credential, + application: "myasg", + region: "us-east-1", + classicLoadBalancers: ["one", "two"] + ) + + when: + String asgName = autoScalingWorker.deploy() + + then: + noExceptionThrown() + 1 * lcBuilder.buildLaunchConfiguration('myasg', null, _, null) >> "myasg-12345" + 1 * autoScaling.createAutoScalingGroup(_) >> { throw new AlreadyExistsException("Already exists, man") } + 1 * autoScaling.describeAutoScalingGroups(_) >> { + new DescribeAutoScalingGroupsResult( + autoScalingGroups: [ + new AutoScalingGroup( + autoScalingGroupName: "myasg-v000", + launchConfigurationName: "myasg-12345", + loadBalancerNames: ["one", "two"], + createdTime: new Date() + ) + ] + ) + } + asgName == "myasg-v000" + } + + void "throws duplicate exception if existing autoscaling group was created before safety window"() { + setup: + def autoScalingWorker = new AutoScalingWorker( + enabledMetrics: ['GroupMinSize', 'GroupMaxSize'], + instanceMonitoring: true, + regionScopedProvider: regionScopedProvider, + credentials: credential, + application: "myasg", + region: "us-east-1", + classicLoadBalancers: ["one", "two"] + ) + + when: + autoScalingWorker.deploy() + + then: + thrown(AlreadyExistsException) + 1 * lcBuilder.buildLaunchConfiguration('myasg', null, _, null) >> "myasg-12345" + _ * autoScaling.createAutoScalingGroup(_) >> { throw new AlreadyExistsException("Already exists, man") } + _ * autoScaling.describeAutoScalingGroups(_) >> { + new DescribeAutoScalingGroupsResult( + autoScalingGroups: [ + new AutoScalingGroup( + autoScalingGroupName: "myasg-v000", + launchConfigurationName: "myasg-12345", + loadBalancerNames: ["one", "two"], + createdTime: new Date(Instant.now().minus(3, ChronoUnit.HOURS).toEpochMilli()) + ) + ] + ) + } + } + + void "throws duplicate exception if existing and desired autoscaling group differ settings"() { + setup: + def autoScalingWorker = new AutoScalingWorker( + enabledMetrics: ['GroupMinSize', 'GroupMaxSize'], + instanceMonitoring: true, + regionScopedProvider: regionScopedProvider, + credentials: credential, + application: "myasg", + region: "us-east-1", + classicLoadBalancers: ["one", "two"] + ) + + when: + autoScalingWorker.deploy() + + then: + thrown(AlreadyExistsException) + 1 * lcBuilder.buildLaunchConfiguration('myasg', null, _, null) >> "myasg-12345" + _ * autoScaling.createAutoScalingGroup(_) >> { throw new AlreadyExistsException("Already exists, man") } + _ * autoScaling.describeAutoScalingGroups(_) >> { + new DescribeAutoScalingGroupsResult( + autoScalingGroups: [ + new AutoScalingGroup( + autoScalingGroupName: "myasg-v000", + launchConfigurationName: "different", + loadBalancerNames: ["three"], + createdTime: new Date() + ) + ] + ) + } + } + @Unroll void "should validate provided subnet ids against those available for subnet type"() { given: