Revert rabbitmq_raft_entry_commit_latency to gauge & fix conversion
It behaves as a gauge when metrics are not aggregated, which is not what
we want. A metric is either a histogram or a gauge; it doesn't change
type based on whether we are aggregating metrics or not. To be honest, I
rushed the #28 acceptance and didn't check this metric type properly.
I'm hoping to pair up with @dcorbacho on this and get the histogram
back. For now, let's keep it a gauge and go forward with the metric
aggregation back-port into v3.8.x.
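
For context on the distinction: a gauge exposes a single sample per
scrape, while a histogram exposes per-bucket counters plus _count and
_sum series. An illustrative comparison of the two exposition formats
(made-up values, not actual plugin output):

# As a gauge: one sample line
rabbitmq_raft_entry_commit_latency_seconds 0.005

# As a histogram: bucket counters plus _count and _sum
rabbitmq_raft_entry_commit_latency_seconds_bucket{le="0.01"} 3
rabbitmq_raft_entry_commit_latency_seconds_bucket{le="+Inf"} 3
rabbitmq_raft_entry_commit_latency_seconds_count 3
rabbitmq_raft_entry_commit_latency_seconds_sum 0.012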

Because the change from millis to micros in rabbitmq/ra#160 was a
breaking change, it was reverted, so we had to fix the conversion (we
missed the undefined in one of the merges - whoops) and revert to
millis.
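
The conversion itself is plain division: the raw sample divided by the
factor yields seconds, the Prometheus base unit. A minimal Erlang sketch
of what the collector is assumed to do with the factor (to_seconds/2 is
a hypothetical helper, not the plugin's actual function):

%% Hypothetical helper illustrating the conversion; undefined means
%% the value is already in base units and passes through unchanged.
to_seconds(Value, undefined) -> Value;
to_seconds(Value, Factor) -> Value / Factor.
%% e.g. to_seconds(5, 1000) = 0.005        (5 ms -> seconds)
%% e.g. to_seconds(2500, 1000000) = 0.0025 (2500 us -> seconds)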

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
gerhard committed Feb 5, 2020
1 parent 413d26d commit 5caa419
Showing 2 changed files with 14 additions and 25 deletions.
src/collectors/prometheus_rabbitmq_core_metrics_collector.erl (13 additions, 22 deletions)
@@ -57,6 +57,10 @@
% Some metrics require to be converted, mostly those that represent time.
% It is a Prometheus best practice to use specific base units: https://prometheus.io/docs/practices/naming/#base-units
% Extra context: https://github.com/prometheus/docs/pull/1414#issuecomment-522337895

+-define(MILLISECOND, 1000).
+-define(MICROSECOND, 1000000).

-define(METRICS_RAW, [
{channel_metrics, [
{2, undefined, channel_consumers, gauge, "Consumers on a channel", consumer_count},
@@ -135,7 +139,7 @@
{2, undefined, erlang_processes_limit, gauge, "Erlang processes limit", proc_total},
{2, undefined, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue},
{2, undefined, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval", net_ticktime},
-{2, 1000, erlang_uptime_seconds, gauge, "Node uptime", uptime}
+{2, ?MILLISECOND, erlang_uptime_seconds, gauge, "Node uptime", uptime}
]},

{node_persister_metrics, [
@@ -154,11 +158,11 @@
{2, undefined, queue_index_read_ops_total, counter, "Total number of Queue Index read operations", queue_index_read_count},
{2, undefined, queue_index_write_ops_total, counter, "Total number of Queue Index write operations", queue_index_write_count},
{2, undefined, queue_index_journal_write_ops_total, counter, "Total number of Queue Index Journal write operations", queue_index_journal_write_count},
-{2, 1000000, io_read_time_seconds_total, counter, "Total I/O read time", io_read_time},
-{2, 1000000, io_write_time_seconds_total, counter, "Total I/O write time", io_write_time},
-{2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
-{2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
-{2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
+{2, ?MICROSECOND, io_read_time_seconds_total, counter, "Total I/O read time", io_read_time},
+{2, ?MICROSECOND, io_write_time_seconds_total, counter, "Total I/O write time", io_write_time},
+{2, ?MICROSECOND, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
+{2, ?MICROSECOND, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
+{2, ?MICROSECOND, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
]},

{ra_metrics, [
@@ -167,7 +171,7 @@
{4, undefined, raft_log_last_applied_index, gauge, "Raft log last applied index"},
{5, undefined, raft_log_commit_index, gauge, "Raft log commit index"},
{6, undefined, raft_log_last_written_index, gauge, "Raft log last written index"},
-{7, raft_entry_commit_latency, gauge, "Time taken for an entry to be committed"}
+{7, ?MILLISECOND, raft_entry_commit_latency_seconds, gauge, "Time taken for an entry to be committed"}
]},

{queue_coarse_metrics, [
@@ -458,27 +462,22 @@ get_data(Table, true) when Table == channel_exchange_metrics;
({_, V1, V2, V3, V4}, {T, A1, A2, A3, A4}) ->
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4};
({_, V1, V2, V3, V4, V5, V6}, {T, A1, A2, A3, A4, A5, A6}) ->
-%% ra_metrics: latency is a histogram.
-{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, update_histogram(V6, A6)};
+{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, V6 + A6};
({_, V1, V2, V3, V4, V5, V6, V7, _}, {T, A1, A2, A3, A4, A5, A6, A7}) ->
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, V6 + A6, V7 + A7}
end, empty(Table), Table)];

get_data(Table, _) ->
ets:tab2list(Table).

-update_histogram(Value, {H, C, S}) ->
-NH = maps:update_with(position(?BUCKETS, Value), fun(V) -> V + 1 end, H),
-{NH, C + 1, S + Value}.

empty(T) when T == channel_queue_exchange_metrics; T == channel_process_metrics ->
{T, 0};
empty(T) when T == connection_coarse_metrics ->
{T, 0, 0, 0};
empty(T) when T == channel_exchange_metrics; T == queue_coarse_metrics; T == connection_metrics ->
{T, 0, 0, 0, 0};
empty(T) when T == ra_metrics ->
-{T, 0, 0, 0, 0, 0, empty_histogram(?BUCKETS)};
+{T, 0, 0, 0, 0, 0, 0};
empty(T) when T == channel_queue_metrics; T == channel_metrics ->
{T, 0, 0, 0, 0, 0, 0, 0};
empty(queue_metrics = T) ->
@@ -490,11 +489,3 @@ sum('', B) ->
B;
sum(A, B) ->
A + B.

-empty_histogram(Buckets) ->
-{maps:from_list([{B, 0} || B <- Buckets]), 0, 0}.
-
-position([Bound | L], Value) when Value > Bound ->
-position(L, Value);
-position([Bound | _], _) ->
-Bound.
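
For readers skimming the collector diff: each ?METRICS_RAW entry appears
to read as {RowPosition, ConversionFactor, MetricName, Type, HelpText[,
SourceKey]}, where RowPosition indexes the value inside the ETS row and
ConversionFactor is the divisor applied to reach base units (undefined
meaning no conversion). For example, {2, ?MILLISECOND,
erlang_uptime_seconds, gauge, "Node uptime", uptime} takes element 2 of
the row, divides it by 1000, and exports it as the gauge
erlang_uptime_seconds. The old raft_entry_commit_latency entry had no
ConversionFactor slot at all; the fix above adds one along with the
_seconds suffix.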
test/rabbit_prometheus_http_SUITE.erl (1 addition, 3 deletions)
@@ -228,9 +228,7 @@ aggregated_metrics_test(Config) ->
?assertEqual(match, re:run(Body, "^rabbitmq_process_max_fds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_ops_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_term_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_count ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_sum ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_bucket{", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_messages_ready ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers ", [{capture, none}, multiline])),
%% Checking the first metric value in each ETS table that requires converting
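
Note the trailing space in the updated pattern:
"^rabbitmq_raft_entry_commit_latency_seconds " matches only the bare
gauge sample, not the _count, _sum, or _bucket series the histogram used
to emit. Against a scrape body it matches a line like this (illustrative
value):

rabbitmq_raft_entry_commit_latency_seconds 0.005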
