Skip to content

Commit

Permalink
Add alarms prometheus collector.
Browse files Browse the repository at this point in the history
close #2653

(cherry picked from commit 9fed915)

# Conflicts:
#	deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
#	release-notes/3.9.4.md
  • Loading branch information
deadtrickster authored and mergify-bot committed Aug 17, 2021
1 parent 9ea36a2 commit 0c0b1e3
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2007-2021 VMware, Inc. or its affiliates. All rights reserved.
%%
-module(prometheus_rabbitmq_alarm_metrics_collector).

-export([register/0, deregister_cleanup/1, collect_mf/2]).

-import(prometheus_model_helpers, [create_mf/4, untyped_metric/1]).

-include_lib("prometheus/include/prometheus.hrl").

-behaviour(prometheus_collector).

-define(METRIC_NAME_PREFIX, "rabbitmq_alarms_").

%%====================================================================
%% Collector API
%%====================================================================

%% @doc Registers this module as a collector by calling
%% prometheus_registry:register_collector/1.
%% Fails with a badmatch error if the registry call returns anything but `ok'.
register() ->
    Result = prometheus_registry:register_collector(?MODULE),
    ok = Result.

%% @doc prometheus_collector callback. The argument is ignored and nothing
%% is cleaned up; the function always returns `ok'.
deregister_cleanup(_Registry) ->
    ok.

%% @doc prometheus_collector callback. Emits one untyped metric family per
%% alarm kind this collector knows about (file descriptor limit, disk limit,
%% memory limit). Each metric is 1 when an alarm of that kind is present in
%% the list returned by rabbit_alarm:get_local_alarms/1, and 0 otherwise.
%%
%% Always returns `ok'. When the alarm lookup times out or returns something
%% other than a list, an error is logged and no alarm metrics are emitted
%% for this scrape.
-spec collect_mf(_Registry, Callback) -> ok
    when _Registry :: prometheus_registry:registry(),
         Callback :: prometheus_collector:callback().
collect_mf(_Registry, Callback) ->
    try
        case rabbit_alarm:get_local_alarms(500) %% TODO: figure out timeout
        of
            Alarms when is_list(Alarms) ->
                ActiveAlarms = lists:foldl(fun mark_active_alarm/2, #{}, Alarms),
                emit_alarm_metric(Callback,
                                  ?METRIC_NAME(<<"file_descriptor_limit">>),
                                  <<"is 1 if file descriptor limit alarm is in effect">>,
                                  file_descriptor_limit,
                                  ActiveAlarms),
                emit_alarm_metric(Callback,
                                  ?METRIC_NAME(<<"disk_limit">>),
                                  <<"is 1 if disk alarm is in effect">>,
                                  disk_limit,
                                  ActiveAlarms),
                emit_alarm_metric(Callback,
                                  ?METRIC_NAME(<<"memory_limit">>),
                                  <<"is 1 if memory alarm is in effect">>,
                                  memory_limit,
                                  ActiveAlarms),
                ok;
            Error ->
                rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
                                 "rabbit_alarm:get_local_alarms returned ~p",
                                 [Error]),
                %% We are not going to render any alarm metrics here.
                %% Breaks continuity but at least doesn't crash the
                %% whole scraping endpoint
                ok
        end
    catch
        exit:{timeout, _} ->
            rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
                             "rabbit_alarm:get_local_alarms timed out"),
            %% We are not going to render any alarm metrics here.
            %% Breaks continuity but at least doesn't crash the
            %% whole scraping endpoint
            ok
    end.

%% Fold step: records in Acc which of the known alarm kinds is active.
mark_active_alarm({{resource_limit, disk, _}, _}, Acc) ->
    Acc#{disk_limit => 1};
mark_active_alarm({{resource_limit, memory, _}, _}, Acc) ->
    Acc#{memory_limit => 1};
mark_active_alarm({file_descriptor_limit, _}, Acc) ->
    Acc#{file_descriptor_limit => 1};
mark_active_alarm(_Other, Acc) ->
    %% An alarm this collector has no metric for must not take down the
    %% whole scrape with a function_clause error; it is simply not reported.
    Acc.

%% Passes a single-sample untyped metric family for one alarm kind to
%% Callback. The sample is the value stored under Key in ActiveAlarms,
%% or 0 when the key is absent.
emit_alarm_metric(Callback, Name, Help, Key, ActiveAlarms) ->
    Value = maps:get(Key, ActiveAlarms, 0),
    Callback(create_mf(Name, Help, untyped, [untyped_metric(Value)])).
7 changes: 7 additions & 0 deletions deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,14 @@

build_dispatcher() ->
{ok, _} = application:ensure_all_started(prometheus),
<<<<<<< HEAD
prometheus_registry:register_collectors([prometheus_rabbitmq_core_metrics_collector]),
=======
prometheus_registry:register_collectors([
prometheus_rabbitmq_core_metrics_collector,
prometheus_rabbitmq_global_metrics_collector,
prometheus_rabbitmq_alarm_metrics_collector]),
>>>>>>> 9fed915192 (Add alarms prometheus collector.)
prometheus_registry:register_collectors('per-object', [
prometheus_vm_system_info_collector,
prometheus_vm_dist_collector,
Expand Down
42 changes: 42 additions & 0 deletions release-notes/3.9.4.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
RabbitMQ `3.9.4` is a maintenance release in the `3.9.x` release series.

Please refer to the **Upgrading to 3.9** section from [v3.9.0 release notes](https://github.com/rabbitmq/rabbitmq-server/releases/tag/v3.9.0) if upgrading from a version prior to 3.9.0.

This release requires at least Erlang 23.2, and supports the latest Erlang 24 version, 24.0.5 at the time of release. [RabbitMQ and Erlang/OTP Compatibility Matrix](https://www.rabbitmq.com/which-erlang.html) has more details on Erlang version requirements for RabbitMQ.


## Changes Worth Mentioning

Release notes are kept under [rabbitmq-server/release-notes](https://github.com/rabbitmq/rabbitmq-server/tree/v3.9.x/release-notes).
Contributors are encouraged to update them together with their changes. This helps with release automation and a more
consistent release schedule.

### Core Server

#### Enhancements

* New Prometheus metrics for alarms:
    * `rabbitmq_alarms_file_descriptor_limit` 1|0
    * `rabbitmq_alarms_disk_limit` 1|0
    * `rabbitmq_alarms_memory_limit` 1|0

While some of the alarms have cluster-wide effect, these metrics are node-local.

GitHub issue: [#2653](https://github.com/rabbitmq/rabbitmq-server/pull/2653)

* Nodes will now use four more environment variables, if set: `RABBITMQ_DEFAULT_USER` (overrides `default_user` in `rabbitmq.conf`), `RABBITMQ_DEFAULT_PASS` (overrides `default_pass`), `RABBITMQ_DEFAULT_VHOST` (overrides `default_vhost`) and `RABBITMQ_ERLANG_COOKIE` (sets [shared authentication secret value](https://www.rabbitmq.com/clustering.html#erlang-cookie)).
These variables **are not recommended to be used in production** but can be the only realistic option in some environments, such as service containers, ECS, and so on.
Most users should continue using `rabbitmq.conf` and a securely generated local cookie file.

GitHub issue: [#3299](https://github.com/rabbitmq/rabbitmq-server/pull/3299)



## Dependency Upgrades

No dependency changes in this release.


## Source Code Archives

To obtain source code of the entire distribution, please download the archive named `rabbitmq-server-3.9.4.tar.xz` instead of the source tarball produced by GitHub.

0 comments on commit 0c0b1e3

Please sign in to comment.