Skip to content
Permalink
Browse files

Show rule evaluation errors on rules page (#4457)

* adding information about the health and errors for Rules

adding Health() and LastError() to the Rule interface. This will allow
us to easily surface information about rules.

Signed-off-by: noqcks <benny@noqcks.io>

* updating rules.html with fields for Rule errors and health state

Signed-off-by: noqcks <benny@noqcks.io>

* fix code comment grammar & access Rule health/error info using a mutex

Signed-off-by: noqcks <benny@noqcks.io>

* s/Errors/Error/ in rules.html to remain consistent with targets.html

Signed-off-by: noqcks <benny@noqcks.io>

* adding periods to code comments in reporting/alerting

Signed-off-by: noqcks <benny@noqcks.io>

* putting health/error below mutex in struct field

Signed-off-by: noqcks <benny@noqcks.io>
  • Loading branch information...
noqcks authored and juliusv committed Aug 6, 2018
1 parent 2b8fc06 commit 8bb6e0dd6e80ab2b9719d2e2e6a762f9f29575d6
Showing with 168 additions and 56 deletions.
  1. +37 −1 rules/alerting.go
  2. +18 −1 rules/manager.go
  3. +42 −5 rules/recording.go
  4. +46 −46 web/ui/bindata.go
  5. +13 −1 web/ui/templates/rules.html
  6. +1 −1 web/ui/templates/targets.html
  7. +11 −1 web/web.go
@@ -108,9 +108,12 @@ type AlertingRule struct {
// true if old state has been restored. We start persisting samples for ALERT_FOR_STATE
// only after the restoration.
restored bool

// Protects the below.
mtx sync.Mutex
// The health of the alerting rule.
health RuleHealth
// The last error seen by the alerting rule.
lastError error
// A map of alerts which are currently active (Pending or Firing), keyed by
// the fingerprint of the labelset they correspond to.
active map[uint64]*Alert
@@ -126,6 +129,7 @@ func NewAlertingRule(name string, vec promql.Expr, hold time.Duration, lbls, ann
holdDuration: hold,
labels: lbls,
annotations: anns,
health: HealthUnknown,
active: map[uint64]*Alert{},
logger: logger,
restored: restored,
@@ -137,6 +141,34 @@ func (r *AlertingRule) Name() string {
return r.name
}

// SetLastError stores err as the last error seen by the alerting rule.
// The field is written while r.mtx is held; passing nil stores nil.
func (r *AlertingRule) SetLastError(err error) {
	r.mtx.Lock()
	r.lastError = err
	r.mtx.Unlock()
}

// LastError reports the error most recently stored on the alerting rule.
// The field is read while r.mtx is held.
func (r *AlertingRule) LastError() error {
	r.mtx.Lock()
	last := r.lastError
	r.mtx.Unlock()
	return last
}

// SetHealth stores health as the current health of the alerting rule.
// The field is written while r.mtx is held.
func (r *AlertingRule) SetHealth(health RuleHealth) {
	r.mtx.Lock()
	r.health = health
	r.mtx.Unlock()
}

// Health reports the health currently stored on the alerting rule.
// The field is read while r.mtx is held.
func (r *AlertingRule) Health() RuleHealth {
	r.mtx.Lock()
	current := r.health
	r.mtx.Unlock()
	return current
}

// Query returns the query expression of the alerting rule.
func (r *AlertingRule) Query() promql.Expr {
return r.vector
@@ -225,6 +257,8 @@ const resolvedRetention = 15 * time.Minute
func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, externalURL *url.URL) (promql.Vector, error) {
res, err := query(ctx, r.vector.String(), ts)
if err != nil {
r.SetHealth(HealthBad)
r.SetLastError(err)
return nil, err
}

@@ -330,6 +364,8 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
}
}

r.SetHealth(HealthGood)
r.SetLastError(err)
return vec, nil
}

@@ -39,6 +39,16 @@ import (
"github.com/prometheus/prometheus/storage"
)

// RuleHealth describes the health state of a rule.
type RuleHealth string

// The possible health states of a rule based on the last execution.
const (
// HealthUnknown is the state the rule constructors assign before any
// evaluation result has been recorded.
HealthUnknown RuleHealth = "unknown"
// HealthGood is set by Eval when it runs to completion.
HealthGood RuleHealth = "ok"
// HealthBad is set by Eval when the rule's query returns an error.
HealthBad RuleHealth = "err"
)

// Constants for instrumentation.
// NOTE(review): the uses of namespace are outside this view; presumably it is
// the prefix for the metric names this package registers — confirm.
const namespace = "prometheus"

@@ -140,7 +150,14 @@ type Rule interface {
Eval(context.Context, time.Time, QueryFunc, *url.URL) (promql.Vector, error)
// String returns a human-readable string representation of the rule.
String() string

// SetLastError sets the current error experienced by the rule.
SetLastError(error)
// LastError returns the last error experienced by the rule.
LastError() error
// SetHealth sets the current health of the rule.
SetHealth(RuleHealth)
// Health returns the current health of the rule.
Health() RuleHealth
SetEvaluationDuration(time.Duration)
GetEvaluationDuration() time.Duration
// HTMLSnippet returns a human-readable string representation of the rule,
@@ -31,10 +31,15 @@ import (

// A RecordingRule records its vector expression into new timeseries.
//
// name, vector and labels are set by the constructor. The fields declared
// after mtx are read and written by the rule's accessor methods while mtx
// is held.
type RecordingRule struct {
	name   string
	vector promql.Expr
	labels labels.Labels
	// Protects the below.
	mtx sync.Mutex
	// The health of the recording rule.
	health RuleHealth
	// The last error seen by the recording rule.
	lastError error
	// The evaluation duration most recently stored via SetEvaluationDuration.
	evaluationDuration time.Duration
}

@@ -43,6 +48,7 @@ func NewRecordingRule(name string, vector promql.Expr, lset labels.Labels) *Reco
return &RecordingRule{
name: name,
vector: vector,
health: HealthUnknown,
labels: lset,
}
}
@@ -66,6 +72,8 @@ func (rule *RecordingRule) Labels() labels.Labels {
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL) (promql.Vector, error) {
vector, err := query(ctx, rule.vector.String(), ts)
if err != nil {
rule.SetHealth(HealthBad)
rule.SetLastError(err)
return nil, err
}
// Override the metric name and labels.
@@ -86,7 +94,8 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu

sample.Metric = lb.Labels()
}

rule.SetHealth(HealthGood)
rule.SetLastError(err)
return vector, nil
}

@@ -112,6 +121,34 @@ func (rule *RecordingRule) SetEvaluationDuration(dur time.Duration) {
rule.evaluationDuration = dur
}

// SetLastError stores err as the last error seen by the recording rule.
// The field is written while rule.mtx is held; passing nil stores nil.
func (rule *RecordingRule) SetLastError(err error) {
	rule.mtx.Lock()
	rule.lastError = err
	rule.mtx.Unlock()
}

// LastError reports the error most recently stored on the recording rule.
// The field is read while rule.mtx is held.
func (rule *RecordingRule) LastError() error {
	rule.mtx.Lock()
	last := rule.lastError
	rule.mtx.Unlock()
	return last
}

// SetHealth stores health as the current health of the recording rule.
// The field is written while rule.mtx is held.
func (rule *RecordingRule) SetHealth(health RuleHealth) {
	rule.mtx.Lock()
	rule.health = health
	rule.mtx.Unlock()
}

// Health reports the health currently stored on the recording rule.
// The field is read while rule.mtx is held.
func (rule *RecordingRule) Health() RuleHealth {
	rule.mtx.Lock()
	current := rule.health
	rule.mtx.Unlock()
	return current
}

// GetEvaluationDuration returns the time in seconds it took to evaluate the recording rule.
func (rule *RecordingRule) GetEvaluationDuration() time.Duration {
rule.mtx.Lock()

0 comments on commit 8bb6e0d

Please sign in to comment.
You can’t perform that action at this time.