Skip to content

Commit

Permalink
feat(dynamicBackoff): Dynamically configurable task backoff (#3228)
Browse files Browse the repository at this point in the history
Task backoff can be configured globally, or drilled down to the cloudProvider or account level.  The relevant max backoff time always wins - if you desire no change of behavior simply do not create a dynamic config entry for a back off.  Note that the max back off time of 2 minutes can not be overridden.
  • Loading branch information
jonsie authored and marchello2000 committed Oct 14, 2019
1 parent 1728606 commit 42a5665
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import com.netflix.spinnaker.orca.TaskResult
import com.netflix.spinnaker.orca.clouddriver.KatoService
import com.netflix.spinnaker.orca.clouddriver.model.Task
import com.netflix.spinnaker.orca.clouddriver.model.TaskId
import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware
import com.netflix.spinnaker.orca.pipeline.model.Stage
import com.netflix.spinnaker.orca.pipeline.model.SystemNotification
import groovy.transform.CompileStatic
Expand All @@ -38,7 +39,7 @@ import java.time.Clock
@Slf4j
@Component
@CompileStatic
class MonitorKatoTask implements RetryableTask {
class MonitorKatoTask implements RetryableTask, CloudProviderAware {

/**
* How long to continue trying to look up a task that reports a 404 Not Found.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -32,18 +34,22 @@ default String getDefaultCloudProvider() {
return DEFAULT_CLOUD_PROVIDER;
}

@Nullable
default String getCloudProvider(Stage stage) {
return getCloudProvider(stage.getContext());
}

@Nullable
default String getCloudProvider(Map<String, Object> context) {
return (String) context.getOrDefault("cloudProvider", getDefaultCloudProvider());
}

@Nullable
default String getCredentials(Stage stage) {
return getCredentials(stage.getContext());
}

@Nullable
default String getCredentials(Map<String, Object> context) {
return (String)
context.getOrDefault(
Expand Down Expand Up @@ -76,4 +82,12 @@ default List<String> getRegions(Map<String, Object> context) {
default List<String> getRegions(Stage stage) {
return getRegions(stage.getContext());
}

default boolean hasCloudProvider(@Nonnull Stage stage) {
return getCloudProvider(stage) != null;
}

default boolean hasCredentials(@Nonnull Stage stage) {
return getCredentials(stage) != null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ package com.netflix.spinnaker.orca.q
import com.netflix.spinnaker.orca.OverridableTimeoutRetryableTask
import com.netflix.spinnaker.orca.RetryableTask
import com.netflix.spinnaker.orca.Task
import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware

interface DummyTask : RetryableTask
interface DummyCloudProviderAwareTask : RetryableTask, CloudProviderAware
interface InvalidTask : Task
interface DummyTimeoutOverrideTask : OverridableTimeoutRetryableTask
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package com.netflix.spinnaker.orca.q.handler

import com.netflix.spectator.api.BasicTag
import com.netflix.spectator.api.Registry
import com.netflix.spinnaker.kork.dynamicconfig.DynamicConfigService
import com.netflix.spinnaker.orca.ExecutionStatus
import com.netflix.spinnaker.orca.ExecutionStatus.CANCELED
import com.netflix.spinnaker.orca.ExecutionStatus.FAILED_CONTINUE
Expand All @@ -33,6 +34,7 @@ import com.netflix.spinnaker.orca.RetryableTask
import com.netflix.spinnaker.orca.Task
import com.netflix.spinnaker.orca.TaskExecutionInterceptor
import com.netflix.spinnaker.orca.TaskResult
import com.netflix.spinnaker.orca.clouddriver.utils.CloudProviderAware
import com.netflix.spinnaker.orca.exceptions.ExceptionHandler
import com.netflix.spinnaker.orca.exceptions.TimeoutException
import com.netflix.spinnaker.orca.ext.beforeStages
Expand Down Expand Up @@ -75,7 +77,8 @@ class RunTaskHandler(
private val clock: Clock,
private val exceptionHandlers: List<ExceptionHandler>,
private val taskExecutionInterceptors: List<TaskExecutionInterceptor>,
private val registry: Registry
private val registry: Registry,
private val dynamicConfigService: DynamicConfigService
) : OrcaMessageHandler<RunTask>, ExpressionAware, AuthenticationAware {

override fun handle(message: RunTask) {
Expand Down Expand Up @@ -170,12 +173,6 @@ class RunTaskHandler(
}
}

private fun maxBackoff(): Long =
taskExecutionInterceptors.fold(Long.MAX_VALUE) {
backoff, interceptor ->
Math.min(backoff, interceptor.maxTaskBackoff())
}

private fun trackResult(stage: Stage, thisInvocationStartTimeMs: Long, taskModel: com.netflix.spinnaker.orca.pipeline.model.Task, status: ExecutionStatus) {
val commonTags = MetricsTagHelper.commonTags(stage, taskModel, status)
val detailedTags = MetricsTagHelper.detailedTaskTags(stage, taskModel, status)
Expand Down Expand Up @@ -209,10 +206,58 @@ class RunTaskHandler(
private fun Task.backoffPeriod(taskModel: com.netflix.spinnaker.orca.pipeline.model.Task, stage: Stage): TemporalAmount =
when (this) {
is RetryableTask -> Duration.ofMillis(
Math.min(getDynamicBackoffPeriod(stage, Duration.ofMillis(System.currentTimeMillis() - (taskModel.startTime
?: 0))), maxBackoff())
retryableBackOffPeriod(taskModel, stage).coerceAtMost(taskExecutionInterceptors.maxBackoff())
)
else -> Duration.ofMillis(1000)
}

/**
* The max back off value always wins. For example, given the following dynamic configs:
* `tasks.global.backOffPeriod = 5000`
* `tasks.aws.backOffPeriod = 80000`
* `tasks.aws.someAccount.backoffPeriod = 60000`
* `tasks.aws.backoffPeriod` will be used (given the criteria matches and unless the default dynamicBackOffPeriod is greater).
*/
private fun RetryableTask.retryableBackOffPeriod(
taskModel: com.netflix.spinnaker.orca.pipeline.model.Task,
stage: Stage
): Long {
val dynamicBackOffPeriod = getDynamicBackoffPeriod(
stage, Duration.ofMillis(System.currentTimeMillis() - (taskModel.startTime ?: 0))
)
val backOffs: MutableList<Long> = mutableListOf(
dynamicBackOffPeriod,
dynamicConfigService.getConfig(
Long::class.java,
"tasks.global.backOffPeriod",
dynamicBackOffPeriod)
)

if (this is CloudProviderAware && hasCloudProvider(stage)) {
backOffs.add(
dynamicConfigService.getConfig(
Long::class.java,
"tasks.${getCloudProvider(stage)}.backOffPeriod",
dynamicBackOffPeriod
)
)
else -> Duration.ofSeconds(1)
if (hasCredentials(stage)) {
backOffs.add(
dynamicConfigService.getConfig(
Long::class.java,
"tasks.${getCloudProvider(stage)}.${getCredentials(stage)}.backOffPeriod",
dynamicBackOffPeriod
)
)
}
}

return backOffs.max() ?: dynamicBackOffPeriod
}

private fun List<TaskExecutionInterceptor>.maxBackoff(): Long =
this.fold(Long.MAX_VALUE) { backoff, interceptor ->
backoff.coerceAtMost(interceptor.maxTaskBackoff())
}

private fun formatTimeout(timeout: Long): String {
Expand Down
Loading

0 comments on commit 42a5665

Please sign in to comment.