From ef2c7a5725c0226a23fbf5f9a15853de685d201b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 9 Oct 2020 13:32:47 +0100 Subject: [PATCH] [BUGFIX] Fix reporting of `cortex_prometheus_rule_group_duration_seconds` (#3310) * [BUGFIX] Fix reporting of `cortex_prometheus_rule_group_duration_seconds` metric Fixes a small bug in the ruler metrics, where we used the wrong name to match against the upstream metric. This caused the metric to not report any data at all. Signed-off-by: gotjosh * Changelog entry Signed-off-by: gotjosh --- CHANGELOG.md | 1 + pkg/ruler/manager_metrics.go | 2 +- pkg/ruler/manager_metrics_test.go | 27 +++++++++++++++++++++------ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef4bd40f61..9af5342c39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,7 @@ * [BUGFIX] Experimental Alertmanager API: Do not allow empty Alertmanager configurations or bad template filenames to be submitted through the configuration API. #3185 * [BUGFIX] Reduce failures to update heartbeat when using Consul. #3259 * [BUGFIX] When using ruler sharding, moving all user rule groups from ruler to a different one and then back could end up with some user groups not being evaluated at all. #3235 +* [BUGFIX] Ruler: fix the `cortex_prometheus_rule_group_duration_seconds` metric, which previously did not report any values. #3310 ## 1.4.0 / 2020-10-02 diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index b888a06571..f4c3942229 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -148,7 +148,7 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { // If same user is later re-added, all metrics will start from 0, which is fine. 
data.SendSumOfSummariesPerUser(out, m.EvalDuration, "prometheus_rule_evaluation_duration_seconds") - data.SendSumOfSummariesPerUser(out, m.IterationDuration, "cortex_prometheus_rule_group_duration_seconds") + data.SendSumOfSummariesPerUser(out, m.IterationDuration, "prometheus_rule_group_duration_seconds") data.SendSumOfCountersPerUser(out, m.IterationsMissed, "prometheus_rule_group_iterations_missed_total") data.SendSumOfCountersPerUser(out, m.IterationsScheduled, "prometheus_rule_group_iterations_total") diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 2686e24235..91149f59da 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -61,12 +61,27 @@ cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user2"} 10 cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user3"} 100 # HELP cortex_prometheus_rule_group_duration_seconds The duration of rule group evaluations. # TYPE cortex_prometheus_rule_group_duration_seconds summary -cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 0 -cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 0 -cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 0 -cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 0 -cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 0 -cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 0 +cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.01"} 1 +cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.05"} 1 +cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.5"} 1 +cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.9"} 1 +cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.99"} 1 +cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 1 +cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 1 
+cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.01"} 10 +cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.05"} 10 +cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.5"} 10 +cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.9"} 10 +cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.99"} 10 +cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 10 +cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 1 +cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.01"} 100 +cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.05"} 100 +cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.5"} 100 +cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.9"} 100 +cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.99"} 100 +cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 100 +cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 1 # HELP cortex_prometheus_rule_group_iterations_missed_total The total number of rule group evaluations missed due to slow rule group evaluation. # TYPE cortex_prometheus_rule_group_iterations_missed_total counter cortex_prometheus_rule_group_iterations_missed_total{user="user1"} 1