Skip to content

Commit

Permalink
chore(monitoring): Log and emit a metric when a retried kato task is …
Browse files Browse the repository at this point in the history
…ultimately unsuccessful (#3895)
  • Loading branch information
jonsie committed Sep 4, 2020
1 parent 7006fb4 commit 826e293
Showing 1 changed file with 23 additions and 0 deletions.
Expand Up @@ -37,6 +37,7 @@ import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Component
import retrofit.RetrofitError

import javax.annotation.Nonnull
import java.time.Clock
import java.time.Duration
import java.util.concurrent.TimeUnit
Expand All @@ -61,6 +62,7 @@ class MonitorKatoTask implements RetryableTask, CloudProviderAware {
this(katoService, registry, Clock.systemUTC(), dynamicConfigService, retrySupport)
}

@VisibleForTesting
MonitorKatoTask(KatoService katoService, Registry registry, Clock clock, DynamicConfigService dynamicConfigService, RetrySupport retrySupport) {
this.registry = registry
this.clock = clock
Expand All @@ -81,6 +83,12 @@ class MonitorKatoTask implements RetryableTask, CloudProviderAware {
return backoffPeriod
}

/**
 * Timeout hook for this task: hands the stage to {@code monitorFinalTerminalRetry} with the
 * reason {@code "timeout"} so that a retried kato task that ultimately timed out is logged and
 * counted, then supplies no task result of its own.
 *
 * @param stage the stage whose kato task monitoring timed out
 * @return always {@code null}
 */
@Override
TaskResult onTimeout(@Nonnull StageExecution stage) {
  final String timeoutReason = "timeout"
  monitorFinalTerminalRetry(stage, timeoutReason)
  return null
}

@Override
TaskResult execute(StageExecution stage) {
TaskId taskId = stage.context."kato.last.task.id" as TaskId
Expand Down Expand Up @@ -181,6 +189,7 @@ class MonitorKatoTask implements RetryableTask, CloudProviderAware {
if (e instanceof RetrofitError) {
RetrofitError retrofitError = (RetrofitError) e
if (retrofitError?.response?.status == 404) {
monitorFinalTerminalRetry(stage, "404")
// unexpected -- no sense attempting to resume a saga that `clouddriver` has no knowledge about
throw e
}
Expand Down Expand Up @@ -224,6 +233,20 @@ class MonitorKatoTask implements RetryableTask, CloudProviderAware {
}
}

/**
 * Logs a warning and emits a metric when a retried kato task is finally terminal - typically
 * either because the task timed out or because of an unexpected error, like a 404, when
 * attempting to retry.
 *
 * Does nothing unless the stage context has {@code kato.task.retriedOperation} set to
 * {@code true}.
 *
 * @param stage the stage whose context holds the last kato task id and the terminal retry count
 * @param reason short label describing why the retry is terminal; written to the log line and
 *               used as the {@code reason} tag of the {@code monitorKatoTask.terminalRetry} counter
 */
private void monitorFinalTerminalRetry(StageExecution stage, String reason) {
  if (stage.context."kato.task.retriedOperation" != true) {
    return
  }

  TaskId taskId = stage.context."kato.last.task.id" as TaskId
  Integer totalRetries = stage.context."kato.task.terminalRetryCount" as Integer

  // Safe navigation: this helper runs on failure paths (timeout handling, and right before the
  // original 404 error is rethrown), so a missing task id must not raise a NullPointerException
  // that would mask the real failure.
  log.warn("Failed retrying kato task '{}' (retries: '{}') due to reason: '{}'", taskId?.id,
    totalRetries, reason)
  registry.counter("monitorKatoTask.terminalRetry", "reason", reason).increment()
}

/**
* @param The task being inspected for region/server group mappings.
* @return Server group names keyed by region.
Expand Down

0 comments on commit 826e293

Please sign in to comment.