From 85f5b6314bd6521268f6e89d9363c6078470c88d Mon Sep 17 00:00:00 2001 From: MasterPtato Date: Tue, 8 Jul 2025 22:12:01 +0000 Subject: [PATCH] fix: gracefully handle prom failure for pb topo --- .../cluster/src/ops/datacenter/topology_get/mod.rs | 4 ++-- .../src/ops/datacenter/topology_get/pegboard.rs | 12 +++++++++++- .../cluster/src/workflows/datacenter/scale.rs | 8 ++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/packages/core/services/cluster/src/ops/datacenter/topology_get/mod.rs b/packages/core/services/cluster/src/ops/datacenter/topology_get/mod.rs index dcb9fcc2c8..8d146b96ef 100644 --- a/packages/core/services/cluster/src/ops/datacenter/topology_get/mod.rs +++ b/packages/core/services/cluster/src/ops/datacenter/topology_get/mod.rs @@ -486,7 +486,7 @@ async fn fetch_server_metrics( }} [5m] ) - [3h] + [3h:] ) ) ) @@ -518,7 +518,7 @@ async fn fetch_server_metrics( device=~"(eth0|eth1)" }}[1m] ) - ) [3h]) + ) [3h:]) ) ) # Convert from B/s to Kb/s diff --git a/packages/core/services/cluster/src/ops/datacenter/topology_get/pegboard.rs b/packages/core/services/cluster/src/ops/datacenter/topology_get/pegboard.rs index 014790c05d..ae883f80a0 100644 --- a/packages/core/services/cluster/src/ops/datacenter/topology_get/pegboard.rs +++ b/packages/core/services/cluster/src/ops/datacenter/topology_get/pegboard.rs @@ -58,7 +58,17 @@ pub async fn pegboard_client_usage_get(ctx: &OperationCtx, input: &Input) -> Glo ) .to_string(), ) - .await?; + .await; + + // Gracefully handle prometheus failure, fall back to no data + let prom_res = match prom_res { + Ok(x) => x, + Err(err) => { + tracing::error!(?err, "failed to fetch pegboard prometheus metrics"); + + Default::default() + }, + }; let mut stats_by_client_id = HashMap::new(); diff --git a/packages/core/services/cluster/src/workflows/datacenter/scale.rs b/packages/core/services/cluster/src/workflows/datacenter/scale.rs index 75d4f69b1f..e09ba09243 100644 --- 
a/packages/core/services/cluster/src/workflows/datacenter/scale.rs +++ b/packages/core/services/cluster/src/workflows/datacenter/scale.rs @@ -207,6 +207,14 @@ async fn calculate_diff( }), )?; + if !topology_res.prometheus_fetched { + tracing::error!("prometheus could not be fetched, not scaling"); + + return Ok(CalculateDiffOutput { + actions: Vec::new(), + }); + } + let dc = unwrap!(datacenter_res.datacenters.first()); // Build hashmap from topos for sorting