# Prometheus recording rules and alerts for the robustirc job.
# Committed messages/s per job over a 5m window, summed over all series
# regardless of the node on which the messages were committed.
job:committed:rate5m_sum = sum(rate(raftmetrics_raft_commitTime_count[5m])) by (job)
# Same metric, but with 30s resolution for loadtests.
job:committed:rate30s_sum = sum(rate(raftmetrics_raft_commitTime_count[30s])) by (job)
# Smallest irc_sessions value across all series of a job.
# TODO(secure): does this time series include s2s connections? maybe introduce a better one
job:sessions:min = min(irc_sessions) by (job)
# Instead of having multiple building blocks, we need to cram all of these into
# a single rule because prometheus executes rules in random order, see also
# https://groups.google.com/d/msg/prometheus-developers/wfGhGkDcZ3Q/yoguDheN-uAJ
# and https://github.com/prometheus/prometheus/issues/17
# The building-block form would have been:
#instance:seconds_in_state:rate = rate(seconds_in_state[1m])
#job_instance:available_secs:sum_rate = sum(instance:seconds_in_state:rate{state=~"Leader|Follower"}) by (job, instance)
#job_instance:total_secs:sum_rate = sum(instance:seconds_in_state:rate) by (job, instance)
#job_instance:availability:sum_rate = job_instance:available_secs:sum_rate / job_instance:total_secs:sum_rate
# Per (job, instance): rate of seconds spent in the Leader or Follower state
# divided by the rate of seconds spent in any state, both over the last 1m.
job_instance:availability:sum_rate = sum(rate(seconds_in_state{state=~"Leader|Follower"}[1m])) by (job, instance) / sum(rate(seconds_in_state[1m])) by (job, instance)
# Per job: sum of the 2 highest per-instance availability values. topk() has no
# grouping clause here, so the top 2 are picked across all series before the
# sum groups them by job.
# NOTE(review): this rule reads the recorded series above, so it is itself
# subject to the rule-ordering caveat described at the top — confirm this is
# acceptable.
job:availability:topk_sum_rate = sum(topk(2, job_instance:availability:sum_rate)) by (job)
# job:leader_flaps_stable:sum_deriv10m indicates whether there were any leader
# changes in the last 10m: it is the summed absolute deriv() of raft_isleader,
# multiplied by 1 if at least 3 robustirc processes have each been running for
# more than 5m and by 0 otherwise (so that it does not fire during updates).
job:leader_flaps_stable:sum_deriv10m = (sum(abs(deriv(raft_isleader{job="robustirc"}[10m]))) by (job)) * (count_scalar((time() - process_start_time_seconds{job="robustirc"}) > (5*60)) >= bool 3)
# Same value without the process-uptime gate.
job:leader_flaps:sum_deriv10m = sum(abs(deriv(raft_isleader{job="robustirc"}[10m]))) by (job)
# Fires when job:availability:topk_sum_rate (the sum of the 2 highest
# per-instance Leader/Follower availability values) stays below 2 for 1m.
ALERT NetworkUnavailable
IF (job:availability:topk_sum_rate < 2)
FOR 1m
LABELS {
job = "robustirc",
}
ANNOTATIONS {
summary = "Not even 2 nodes are Leader/Follower",
description = "Only {{$value}} nodes are in Leader or Follower raft state. The network needs at least 2.",
}
# Fires when fewer than 3 robustirc targets have been up for 30m.
# sum(up) is used instead of count(up == 1): when every target is down,
# "up == 1" matches no series, count() then returns an empty result and the
# alert would never fire. sum(up) evaluates to 0 in that case and to the same
# number as the count otherwise, since up is either 0 or 1.
ALERT CapacityNotNPlusOne
IF (sum(up{job="robustirc"}) < 3)
FOR 30m
LABELS {
job = "robustirc",
}
ANNOTATIONS {
summary = "Capacity less than n+1",
description = "Only {{$value}} of 3 nodes are up. Please replace the faulty nodes.",
}
# Fires when the 5m commit rate, scaled from per-second to per-minute, stays
# below 1 message per minute for 5m.
# With just one connection, there will be at least one PING message every 60s.
# If your network does not have even one permanent connection, you must use a
# prober such as http://kanla.zekjur.net to make sure there are messages.
ALERT CommitRateTooLow
IF ((60 * job:committed:rate5m_sum{job="robustirc"}) < 1)
FOR 5m
LABELS {
job = "robustirc",
}
ANNOTATIONS {
summary = "Network is not committing messages",
description = "Commit rate is only {{$value}} messages per minute.",
}
# Fires when the highest raft_isleader value across all robustirc series stays
# below 1 for 5m, i.e. no instance reports itself as leader.
# NOTE(review): if no raft_isleader series exist at all, max() presumably
# yields no result and this alert stays silent — confirm.
# TODO: NetworkUnavailable covers this. Once NetworkUnavailable fires, remove NoLeader.
ALERT NoLeader
IF (max(raft_isleader{job="robustirc"}) < 1)
FOR 5m
LABELS {
job = "robustirc",
}
ANNOTATIONS {
summary = "No raft leader elected",
description = "Without a raft leader, the network cannot make progress.",
}
# Fires when job:leader_flaps_stable:sum_deriv10m stays above 0 for 60m, i.e.
# leader changes keep being observed while the uptime gate of that rule is met.
ALERT LeaderFlapping
IF (job:leader_flaps_stable:sum_deriv10m > 0)
FOR 60m
LABELS {
job = "robustirc",
}
ANNOTATIONS {
summary = "Raft leader flapping",
description = "The network is switching back and forth between leaders. Check timeouts?",
}