From 2e22163eb64fc46dc0186c7cbba75df84b835fad Mon Sep 17 00:00:00 2001
From: Gerhard Lazu <gerhard@lazu.co.uk>
Date: Thu, 6 Feb 2020 17:37:37 +0000
Subject: [PATCH] Replace histogram type with gauge for
 raft_entry_commit_latency_seconds

We want to keep the same metric type regardless whether we aggregate or
don't. If we had used a histogram type, considering the ~12 buckets that
we added, it would have meant 12 extra metrics per queue which would
have resulted in an explosion of metrics. Keeping the gauge type and
aggregating latencies across all members.

re https://github.com/rabbitmq/rabbitmq-prometheus/pull/28

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
(cherry picked from commit 3a24c4a7b44e3cb4c1b60918c85052bf667a053e)
Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
---
 deps/rabbitmq_prometheus/Makefile             |  2 +-
 deps/rabbitmq_prometheus/README.md            | 17 +++++----
 deps/rabbitmq_prometheus/metrics.md           | 16 ++++-----
 .../priv/schema/rabbitmq_prometheus.schema    |  4 +++
 ...etheus_rabbitmq_core_metrics_collector.erl | 35 +++++++++----------
 .../test/rabbit_prometheus_http_SUITE.erl     |  8 ++---
 6 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/deps/rabbitmq_prometheus/Makefile b/deps/rabbitmq_prometheus/Makefile
index ce138b9517..bdfbe6e912 100644
--- a/deps/rabbitmq_prometheus/Makefile
+++ b/deps/rabbitmq_prometheus/Makefile
@@ -11,7 +11,7 @@ OTP_SHA256 := 4cf44ed12f657c309a2c00e7806f36f56a88e5b74de6814058796561f3842f66
 
 define PROJECT_ENV
 [
- {enable_metric_aggregation, false}
+	{enable_metrics_aggregation, false}
 ]
 endef
 
diff --git a/deps/rabbitmq_prometheus/README.md b/deps/rabbitmq_prometheus/README.md
index 241528945d..f0685090e5 100644
--- a/deps/rabbitmq_prometheus/README.md
+++ b/deps/rabbitmq_prometheus/README.md
@@ -1,18 +1,19 @@
-# Prometheus Exporter of Core (Raw, Unaggregated) RabbitMQ Metrics
+# Prometheus Exporter of Core RabbitMQ Metrics
 
 ## Getting Started
 
-This is a Prometheus exporter of core (raw, unaggregated) RabbitMQ metrics, developed by the RabbitMQ core team.
+This is a Prometheus exporter of core RabbitMQ metrics, developed by the RabbitMQ core team.
 It is largely a "clean room" design that reuses some prior work from Prometheus exporters done by the community.
 
 ## Project Maturity
 
-This plugin is new and relatively immature. It shipped in the RabbitMQ distribution starting with `3.8.0`.
+This plugin is new as of RabbitMQ `3.8.0`.
 
 ## Documentation
 
 See [Monitoring RabbitMQ with Prometheus and Grafana](https://www.rabbitmq.com/prometheus.html).
 
+
 ## Installation
 
 This plugin is included into RabbitMQ 3.8.x releases. Like all [plugins](https://www.rabbitmq.com/plugins.html), it has to be
@@ -22,7 +23,6 @@ To enable it with [rabbitmq-plugins](http://www.rabbitmq.com/man/rabbitmq-plugin
 
     rabbitmq-plugins enable rabbitmq_prometheus
 
-
 ## Usage
 
 See the [documentation guide](https://www.rabbitmq.com/prometheus.html).
@@ -30,13 +30,17 @@ See the [documentation guide](https://www.rabbitmq.com/prometheus.html).
 Default port used by the plugin is `15692`. In most environments there would be no configuration
 necessary.
 
-See the entire list of [metrics](metrics.md) exposed via the default port. 
+See the entire list of [metrics](metrics.md) exposed via the default port.
+
 
 ## Configuration
 
 This exporter supports the following options via a set of `prometheus.*` configuration keys:
 
- * `prometheus.path` defines a scrape endpoint. Default is `"/metrics"`.
+ * `prometheus.enable_metrics_aggregation` returns all metrics aggregated (default is `false`).
+    Nodes with over 50k objects (queues, connections, channels) can take 30 seconds or more to return metrics without this option.
+    See #26 for more details.
+ * `prometheus.path` defines a scrape endpoint (default is `"/metrics"`).
  * `prometheus.tcp.*` controls HTTP listener settings that match [those used by the RabbitMQ HTTP API](https://www.rabbitmq.com/management.html#configuration)
  * `prometheus.ssl.*` controls TLS (HTTPS) listener settings that match [those used by the RabbitMQ HTTP API](https://www.rabbitmq.com/management.html#single-listener-https)
 
@@ -44,6 +48,7 @@ Sample configuration snippet:
 
 ``` ini
 # these values are defaults
+prometheus.enable_metrics_aggregation = false
 prometheus.path = /metrics
 prometheus.tcp.port =  15692
 ```
diff --git a/deps/rabbitmq_prometheus/metrics.md b/deps/rabbitmq_prometheus/metrics.md
index 8c4f69f513..fff4820e0b 100644
--- a/deps/rabbitmq_prometheus/metrics.md
+++ b/deps/rabbitmq_prometheus/metrics.md
@@ -157,14 +157,14 @@
 
 ### Raft
 
-| Metric                               | Description                             |
-| ---                                  | ---                                     |
-| rabbitmq_raft_term_total             | Current Raft term number                |
-| rabbitmq_raft_log_snapshot_index     | Raft log snapshot index                 |
-| rabbitmq_raft_log_last_applied_index | Raft log last applied index             |
-| rabbitmq_raft_log_commit_index       | Raft log commit index                   |
-| rabbitmq_raft_log_last_written_index | Raft log last written index             |
-| rabbitmq_raft_entry_commit_latency   | Time taken for an entry to be committed |
+| Metric                                     | Description                             |
+| ---                                        | ---                                     |
+| rabbitmq_raft_term_total                   | Current Raft term number                |
+| rabbitmq_raft_log_snapshot_index           | Raft log snapshot index                 |
+| rabbitmq_raft_log_last_applied_index       | Raft log last applied index             |
+| rabbitmq_raft_log_commit_index             | Raft log commit index                   |
+| rabbitmq_raft_log_last_written_index       | Raft log last written index             |
+| rabbitmq_raft_entry_commit_latency_seconds | Time taken for an entry to be committed |
 
 ## Telemetry
 
diff --git a/deps/rabbitmq_prometheus/priv/schema/rabbitmq_prometheus.schema b/deps/rabbitmq_prometheus/priv/schema/rabbitmq_prometheus.schema
index 4f1028a2a6..bf8a9b4f1d 100644
--- a/deps/rabbitmq_prometheus/priv/schema/rabbitmq_prometheus.schema
+++ b/deps/rabbitmq_prometheus/priv/schema/rabbitmq_prometheus.schema
@@ -4,6 +4,10 @@
 %% See https://rabbitmq.com/prometheus.html for details
 %% ----------------------------------------------------------------------------
 
+%% Option to enable metrics aggregation
+{mapping, "prometheus.enable_metrics_aggregation", "rabbitmq_prometheus.enable_metrics_aggregation",
+    [{datatype, {enum, [true, false]}}]}.
+
 %% Endpoint path
 {mapping, "prometheus.path", "rabbitmq_prometheus.path",
     [{datatype, string}]}.
diff --git a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
index 3fe3abcc90..c4e73de051 100644
--- a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
+++ b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
@@ -23,8 +23,7 @@
                                    create_mf/5,
                                    gauge_metric/2,
                                    counter_metric/2,
-                                   untyped_metric/2,
-                                   histogram_metric/4]).
+                                   untyped_metric/2]).
 
 -include_lib("prometheus/include/prometheus.hrl").
 -include_lib("rabbit_common/include/rabbit.hrl").
@@ -210,9 +209,6 @@
     {queue_metrics, queues, gauge, "Queues available"}
 ]).
 
--define(BUCKETS, [100, 300, 500, 750, 1000, 2500, 5000, 7500, 10000,
-                  30000, 60000, 90000, infinity]).
- 
 %%====================================================================
 %% Collector API
 %%====================================================================
@@ -223,7 +219,7 @@ register() ->
 deregister_cleanup(_) -> ok.
 
 collect_mf(_Registry, Callback) ->
-    {ok, Enable} = application:get_env(rabbitmq_prometheus, enable_metric_aggregation),
+    {ok, Enable} = application:get_env(rabbitmq_prometheus, enable_metrics_aggregation),
     [begin
          Data = get_data(Table, Enable),
          mf(Callback, Contents, Data)
@@ -343,8 +339,6 @@ metric(counter, Labels, Value) ->
     emit_counter_metric_if_defined(Labels, Value);
 metric(gauge, Labels, Value) ->
     emit_gauge_metric_if_defined(Labels, Value);
-metric(histogram, Labels, Value) ->
-    emit_histogram_metric(Labels, Value);
 metric(untyped, Labels, Value) ->
     untyped_metric(Labels, Value);
 metric(boolean, Labels, Value0) ->
@@ -381,11 +375,6 @@ emit_gauge_metric_if_defined(Labels, Value) ->
       gauge_metric(Labels, Value)
   end.
 
-emit_histogram_metric(Labels, {Buckets, Count, Sum}) ->
-    histogram_metric(Labels, maps:to_list(Buckets), Count, Sum);
-emit_histogram_metric(Labels, Value) ->
-    emit_gauge_metric_if_defined(Labels, Value).
-
 get_data(connection_metrics = Table, true) ->
     {Table, A1, A2, A3, A4} = ets:foldl(fun({_, Props}, {T, A1, A2, A3, A4}) ->
                                             {T,
@@ -451,7 +440,7 @@ get_data(Table, true) when Table == channel_exchange_metrics;
                            Table == channel_queue_exchange_metrics;
                            Table == ra_metrics;
                            Table == channel_process_metrics ->
-    [ets:foldl(fun({_, V1}, {T, A1}) ->
+    Result = ets:foldl(fun({_, V1}, {T, A1}) ->
                        {T, V1 + A1};
                   ({_, V1, _}, {T, A1}) ->
                        {T, V1 + A1};
@@ -462,14 +451,24 @@ get_data(Table, true) when Table == channel_exchange_metrics;
                   ({_, V1, V2, V3, V4}, {T, A1, A2, A3, A4}) ->
                        {T, V1 + A1, V2 + A2, V3 + A3, V4 + A4};
                   ({_, V1, V2, V3, V4, V5, V6}, {T, A1, A2, A3, A4, A5, A6}) ->
-                       {T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, V6 + A6};
+                       {T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, accumulate_count_and_sum(V6, A6)};
                   ({_, V1, V2, V3, V4, V5, V6, V7, _}, {T, A1, A2, A3, A4, A5, A6, A7}) ->
                        {T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, V6 + A6, V7 + A7}
-               end, empty(Table), Table)];
-
+               end, empty(Table), Table),
+    case Table of
+        %% raft_entry_commit_latency_seconds needs to be an average
+        ra_metrics ->
+            {Count, Sum} = element(7, Result),
+            [setelement(7, Result, Sum / Count)];
+        _ ->
+            [Result]
+    end;
 get_data(Table, _) ->
     ets:tab2list(Table).
 
+accumulate_count_and_sum(Value, {Count, Sum}) ->
+    {Count + 1, Sum + Value}.
+
 empty(T) when T == channel_queue_exchange_metrics; T == channel_process_metrics ->
     {T, 0};
 empty(T) when T == connection_coarse_metrics ->
@@ -477,7 +476,7 @@ empty(T) when T == connection_coarse_metrics ->
 empty(T) when T == channel_exchange_metrics; T == queue_coarse_metrics; T == connection_metrics ->
     {T, 0, 0, 0, 0};
 empty(T) when T == ra_metrics ->
-    {T, 0, 0, 0, 0, 0, 0};
+    {T, 0, 0, 0, 0, 0, {0, 0}};
 empty(T) when T == channel_queue_metrics; T == channel_metrics ->
     {T, 0, 0, 0, 0, 0, 0, 0};
 empty(queue_metrics = T) ->
diff --git a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
index a9bd1da0a8..c6b20b3032 100644
--- a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
+++ b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
@@ -63,7 +63,7 @@ init_per_group(config_port, Config0) ->
     Config1 = rabbit_ct_helpers:merge_app_env(Config0, PathConfig),
     init_per_group(config_port, Config1, [{prometheus_port, 15772}]);
 init_per_group(with_aggregation, Config0) ->
-    PathConfig = {rabbitmq_prometheus, [{enable_metric_aggregation, true}]},
+    PathConfig = {rabbitmq_prometheus, [{enable_metrics_aggregation, true}]},
     Config1 = rabbit_ct_helpers:merge_app_env(Config0, PathConfig),
     init_per_group(with_metrics, Config1);
 init_per_group(with_metrics, Config0) ->
@@ -199,7 +199,6 @@ metrics_test(Config) ->
     ?assertEqual(match, re:run(Body, "^rabbitmq_process_max_fds ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_ops_total ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_raft_term_total{", [{capture, none}, multiline])),
-    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds{", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_queue_messages_ready{", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers{", [{capture, none}, multiline])),
     %% Checking the first metric value in each ETS table that requires converting
@@ -228,14 +227,15 @@ aggregated_metrics_test(Config) ->
     ?assertEqual(match, re:run(Body, "^rabbitmq_process_max_fds ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_ops_total ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_raft_term_total ", [{capture, none}, multiline])),
-    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_queue_messages_ready ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers ", [{capture, none}, multiline])),
     %% Checking the first metric value in each ETS table that requires converting
     ?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
     ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
     %% Checking the first TOTALS metric value
-    ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).
+    ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])),
+    %% Checking raft_entry_commit_latency_seconds because we are aggregating it
+    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds ", [{capture, none}, multiline])).
 
 build_info_test(Config) ->
     {_Headers, Body} = http_get(Config, [], 200),