diff --git a/CHANGELOG.md b/CHANGELOG.md index f9d1e4d..6bd966e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Changed + +- Reduce severity of Pod eviction errors. Previously, the operator would produce lots of + `Cannot evict pod as it would violate the pod's disruption budget` errors. With this fix, the + error is logged at info level instead ([#372]). + +[#372]: https://github.com/stackabletech/commons-operator/pull/372 + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 diff --git a/Cargo.lock b/Cargo.lock index 4c70a7f..7ec55f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2566,6 +2566,7 @@ dependencies = [ "built", "clap", "futures 0.3.31", + "http", "serde", "serde_json", "snafu 0.8.7", diff --git a/Cargo.nix b/Cargo.nix index 91feb07..ddd4a83 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -8344,6 +8344,10 @@ rec { packageId = "futures 0.3.31"; features = [ "compat" ]; } + { + name = "http"; + packageId = "http"; + } { name = "serde"; packageId = "serde"; diff --git a/Cargo.toml b/Cargo.toml index 8b5d202..cf91d22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ anyhow = "1.0" built = { version = "0.8", features = ["chrono", "git2"] } clap = "4.5" futures = { version = "0.3", features = ["compat"] } +http = "1.3" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" snafu = "0.8" diff --git a/rust/operator-binary/Cargo.toml b/rust/operator-binary/Cargo.toml index 78bad2c..f4f0508 100644 --- a/rust/operator-binary/Cargo.toml +++ b/rust/operator-binary/Cargo.toml @@ -13,6 +13,7 @@ stackable-operator.workspace = true anyhow.workspace = true clap.workspace = true +http.workspace = true futures.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/rust/operator-binary/src/restart_controller/pod.rs b/rust/operator-binary/src/restart_controller/pod.rs index 59f5a97..08818a3 100644 --- 
a/rust/operator-binary/src/restart_controller/pod.rs +++ b/rust/operator-binary/src/restart_controller/pod.rs @@ -1,6 +1,7 @@ use std::{sync::Arc, time::Duration}; use futures::StreamExt; +use http::StatusCode; use snafu::{OptionExt, ResultExt, Snafu}; use stackable_operator::{ client::Client, @@ -11,10 +12,10 @@ use stackable_operator::{ kube::{ self, api::{EvictParams, PartialObjectMeta}, - core::DynamicObject, + core::{DynamicObject, ErrorResponse}, runtime::{ Controller, - controller::Action, + controller::{self, Action}, events::{Recorder, Reporter}, reflector::ObjectRef, watcher, @@ -96,10 +97,7 @@ pub async fn start(client: &Client, watch_namespace: &WatchNamespace) { // The event_recorder needs to be shared across all invocations, so that // events are correctly aggregated let event_recorder = event_recorder.clone(); - async move { - report_controller_reconciled(&event_recorder, FULL_CONTROLLER_NAME, &result) - .await; - } + async move { report_result(result, event_recorder).await } }, ) .await; @@ -192,6 +190,58 @@ async fn reconcile(pod: Arc>, ctx: Arc) -> Result>, Action), + controller::Error, + >, + event_recorder: Arc, +) { + if let Err(controller::Error::ReconcilerFailed( + Error::EvictPod { + source: evict_pod_error, + }, + pod_ref, + )) = &result + { + const TOO_MANY_REQUESTS_HTTP_CODE: u16 = StatusCode::TOO_MANY_REQUESTS.as_u16(); + // We can not blanket silence all 429 responses, as it could be something else. + // E.g. I have seen "storage is re-initializing" in the past. + const EVICT_ERROR_MESSAGE: &str = + "Cannot evict pod as it would violate the pod's disruption budget."; + + if let kube::Error::Api(ErrorResponse { + code: TOO_MANY_REQUESTS_HTTP_CODE, + message: error_message, + .. 
+ }) = evict_pod_error + // TODO: We need Rust 1.88 and 2024 edition for if-let-chains + // && error_message == EVICT_ERROR_MESSAGE + { + if error_message == EVICT_ERROR_MESSAGE { + tracing::info!( + k8s.object.ref = %pod_ref, + error = %evict_pod_error, + "Tried to evict Pod, but wasn't allowed to do so, as it would violate the Pod's disruption budget. Retrying later" + ); + return; + } + } + } + + report_controller_reconciled(&event_recorder, FULL_CONTROLLER_NAME, &result).await; +} + fn error_policy(_obj: Arc>, _error: &Error, _ctx: Arc) -> Action { Action::requeue(Duration::from_secs(5)) }