Skip to content

Commit

Permalink
mgr/dashboard: monitoring: refactor into ceph-mixin
Browse files Browse the repository at this point in the history
A mixin is a way to bundle dashboards, Prometheus rules and alerts into
a jsonnet package. Shifting to a mixin will allow easier integration with
the monitoring automation that some users may use.

This commit moves `/monitoring/grafana/dashboards` and
`/monitoring/prometheus` to `/monitoring/ceph-mixin`. The Prometheus alerts
were also converted to Jsonnet in an automated way (from yaml to json
to jsonnet). This commit minimises any change made to the generated files
and should change neither the dashboards nor the Prometheus alerts.

In the future some configuration will also be added to jsonnet to add
more functionalities to the dashboards or alerts (i.e.: multi cluster).

Fixes: https://tracker.ceph.com/issues/53374
Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@cern.ch>
  • Loading branch information
MrFreezeex committed Feb 3, 2022
1 parent e102620 commit 98236e3
Show file tree
Hide file tree
Showing 81 changed files with 3,269 additions and 1,748 deletions.
6 changes: 1 addition & 5 deletions CMakeLists.txt
Expand Up @@ -663,8 +663,6 @@ option(WITH_SYSTEMD "build with systemd support" ON)
add_subdirectory(src)

add_subdirectory(qa)
add_subdirectory(monitoring)

add_subdirectory(doc)
if(WITH_MANPAGE)
add_subdirectory(man)
Expand All @@ -679,9 +677,7 @@ if(LINUX)
endif()

option(WITH_GRAFANA "install grafana dashboards" OFF)
if(WITH_GRAFANA)
add_subdirectory(monitoring/grafana/dashboards)
endif()
add_subdirectory(monitoring/ceph-mixin)

CMAKE_DEPENDENT_OPTION(WITH_BOOST_VALGRIND "Boost support for valgrind" OFF
"NOT WITH_SYSTEM_BOOST" OFF)
Expand Down
4 changes: 1 addition & 3 deletions ceph.spec.in
Expand Up @@ -1434,7 +1434,7 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd-mirror

# prometheus alerts
install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml
install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml

%if 0%{?suse_version}
# create __pycache__ directories and their contents
Expand Down Expand Up @@ -2503,8 +2503,6 @@ exit 0
%endif
%attr(0755,root,root) %dir %{_sysconfdir}/grafana/dashboards/ceph-dashboard
%config %{_sysconfdir}/grafana/dashboards/ceph-dashboard/*
%doc monitoring/grafana/dashboards/README
%doc monitoring/grafana/README.md

%files prometheus-alerts
%if 0%{?suse_version}
Expand Down
2 changes: 1 addition & 1 deletion debian/rules
Expand Up @@ -73,7 +73,7 @@ override_dh_auto_install:

install -m 755 src/cephadm/cephadm $(DESTDIR)/usr/sbin/cephadm

install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml
install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml

# doc/changelog is a directory, which confuses dh_installchangelogs
override_dh_installchangelogs:
Expand Down
1 change: 0 additions & 1 deletion monitoring/CMakeLists.txt

This file was deleted.

1 change: 1 addition & 0 deletions monitoring/ceph-mixin/.gitignore
@@ -0,0 +1 @@
vendor
1 change: 1 addition & 0 deletions monitoring/ceph-mixin/.pylintrc
53 changes: 53 additions & 0 deletions monitoring/ceph-mixin/CMakeLists.txt
@@ -0,0 +1,53 @@
# Install the pre-generated Grafana dashboards and, when tests are enabled,
# register the lint/check tests for the dashboards and the Prometheus alerts.
if(WITH_GRAFANA)
  set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard"
    CACHE PATH "Location for grafana dashboards")
  file(GLOB CEPH_GRAFANA_DASHBOARDS "dashboards_out/*.json")
  install(FILES
    ${CEPH_GRAFANA_DASHBOARDS}
    DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR})
  if(WITH_TESTS)
    # Load the modules providing ExternalProject_Add(), add_tox_test() and
    # add_ceph_test() up front so this file does not depend on another
    # directory having included them first.
    include(ExternalProject)
    include(AddCephTest)

    # Build the jsonnet-bundler ("jb") tool from a pinned upstream tag.
    ExternalProject_Add(jsonnet-bundler
      GIT_REPOSITORY "https://github.com/jsonnet-bundler/jsonnet-bundler.git"
      GIT_TAG "v0.4.0"
      GIT_SHALLOW TRUE
      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/jsonnet-bundler
      CONFIGURE_COMMAND ""
      DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src
      BUILD_COMMAND make build
      BUILD_IN_SOURCE 1
      INSTALL_COMMAND cp <SOURCE_DIR>/_output/jb <INSTALL_DIR>)

    # Use $TMPDIR for the virtualenv when it is set, otherwise fall back to
    # the build tree. Only the fallback is conditional: the tests below must
    # be registered in both cases.
    set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR})
    if(NOT CEPH_BUILD_VIRTUALENV)
      set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR})
    endif()

    add_tox_test(grafana-lint TOX_ENVS lint)
    add_tox_test(jsonnet-lint TOX_ENVS jsonnet-lint)
    add_tox_test(jsonnet-check TOX_ENVS jsonnet-check)
    add_tox_test(alerts-check TOX_ENVS alerts-check)
    add_tox_test(alerts-lint TOX_ENVS alerts-lint)
    add_tox_test(promql-query-test TOX_ENVS promql-query-test)

    # If PROMTOOL_EXECUTABLE is already defined before find_program() runs
    # (e.g. cached or passed by the user), the warnings below are suppressed.
    if(DEFINED PROMTOOL_EXECUTABLE)
      set(promtool_executable_checked TRUE)
    endif()

    find_program(PROMTOOL_EXECUTABLE promtool)
    if(PROMTOOL_EXECUTABLE)
      # Probe whether this promtool understands the "test rules" subcommand;
      # a zero exit status means it does.
      execute_process(
        COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null
        RESULT_VARIABLE rc
        OUTPUT_QUIET)
      if(NOT rc)
        add_ceph_test(run-promtool-unittests
          ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_CURRENT_SOURCE_DIR}/tests_alerts/test_alerts.yml)
      elseif(NOT promtool_executable_checked)
        message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, "
          "please use a newer prometheus")
      endif()
    elseif(NOT promtool_executable_checked)
      message(WARNING "run-promtool-unittests is skipped due to missing promtool")
    endif()
  endif()
endif()
24 changes: 24 additions & 0 deletions monitoring/ceph-mixin/Makefile
@@ -0,0 +1,24 @@
# Developer entry points for the ceph-mixin: format, generate, lint and test
# the jsonnet sources and Prometheus alerts through the tox environments.

# Jsonnet sources the generated dashboards depend on. Without this definition
# the prerequisite list of `dashboards_out` would silently expand to nothing.
JSONNETS_FILES := $(wildcard *.jsonnet *.libsonnet dashboards/*.libsonnet)

all: fmt generate lint test

# Reformat the jsonnet sources in place.
fmt:
	./lint-jsonnet.sh -i

generate: dashboards_out

# Fetch the third-party jsonnet libraries pinned in the lock file.
vendor: jsonnetfile.lock.json
	tox -ejsonnet-bundler-install

# Regenerate the dashboards whenever a jsonnet source changes.
dashboards_out: vendor $(JSONNETS_FILES)
	tox -ejsonnet-fix

lint:
	tox -ejsonnet-lint
	tox -ealerts-lint

test: generate
	tox -ejsonnet-check
	tox -epromql-query-test
	tox -ealerts-check
check: test

.PHONY: all fmt generate lint test check
52 changes: 52 additions & 0 deletions monitoring/ceph-mixin/README.md
@@ -0,0 +1,52 @@
## Prometheus Monitoring Mixin for Ceph
A set of Grafana dashboards and Prometheus alerts for Ceph.

All the Grafana dashboards are already generated in the `dashboards_out`
directory and alerts in the `prometheus_alerts.yaml` file.

You can use the Grafana dashboards and alerts with Jsonnet like any other
prometheus mixin. You can find more resources about mixins in general on
[monitoring.mixins.dev](https://monitoring.mixins.dev/).

### Grafana dashboards for Ceph
In `dashboards_out` you can find a collection of
[Grafana](https://grafana.com/grafana) dashboards for Ceph Monitoring.

These dashboards are based on metrics collected
from [prometheus](https://prometheus.io/) scraping the [prometheus mgr
plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the
[node_exporter](https://github.com/prometheus/node_exporter).

#### Requirements

- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed on
your Grafana instance
- [Pie Chart Panel](https://grafana.com/grafana/plugins/grafana-piechart-panel/)
installed on your Grafana instance


### Prometheus alerts
In `prometheus_alerts.yaml` you'll find a set of Prometheus
alert rules that should provide a decent set of default alerts for a
Ceph cluster. Just put this file in a place according to your Prometheus
configuration (wherever the `rules` configuration stanza points).

#### SNMP
Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending Prometheus
alerts through to an SNMP management platform. The translation from Prometheus
alert to SNMP trap requires the Prometheus alert to contain an OID that maps to
a definition within the MIB. When making changes to the Prometheus alert rules
file, developers should include any necessary changes to the MIB.

### Building from Jsonnet

- Install [jsonnet](https://jsonnet.org/)
  - Install the `jsonnet` package on most distributions, or
    `golang-github-google-jsonnet` on Fedora
- Install [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler)

To rebuild all the generated files, you can run `tox -ejsonnet-fix`.

The jsonnet code located in this directory depends on some third-party Jsonnet
libraries. To update those libraries you can run `jb update` and then update
the generated files using `tox -ejsonnet-fix`.
3 changes: 3 additions & 0 deletions monitoring/ceph-mixin/alerts.libsonnet
@@ -0,0 +1,3 @@
// Parse the alert rules shipped in prometheus_alerts.yaml and merge them
// into the hidden `prometheusAlerts` field.
local alertsYaml = importstr 'prometheus_alerts.yaml';

{
  prometheusAlerts+:: std.parseYaml(alertsYaml),
}
1 change: 1 addition & 0 deletions monitoring/ceph-mixin/config.libsonnet
@@ -0,0 +1 @@
// Mixin configuration object; currently empty.
{}
6 changes: 6 additions & 0 deletions monitoring/ceph-mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
// Expose every dashboard of the mixin as a visible top-level field, keyed by
// the dashboard's name in `grafanaDashboards`.
local mixin = import 'mixin.libsonnet';
local allDashboards = mixin.grafanaDashboards;

{
  [fileName]: allDashboards[fileName]
  for fileName in std.objectFields(allDashboards)
}
103 changes: 103 additions & 0 deletions monitoring/ceph-mixin/dashboards/cephfs.libsonnet
@@ -0,0 +1,103 @@
// CephFS dashboard definition: contributes 'cephfs-overview.json' to the
// hidden `grafanaDashboards` field. `g` is the grafonnet library and `u` the
// local utils.libsonnet helpers; the u.*Schema helpers take positional
// arguments whose meaning is defined in utils.libsonnet (not visible here).
local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';

{
grafanaDashboards+:: {
'cephfs-overview.json':
// Builds one graph panel with a single query target (`expr`, labelled
// `legendFormat`) and places it on the grid at (x, y) with size (w, h).
local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
u.graphPanelSchema({},
title,
'',
'null',
false,
formatY1,
'short',
labelY1,
null,
0,
1,
'$datasource')
.addTargets(
[u.addTargetSchema(expr, 1, 'time_series', legendFormat)]
) + { gridPos: { x: x, y: y, w: w, h: h } };

// Dashboard object; presumably 'MDS Performance' is the title and
// 'tbO9LAiZz' the uid -- TODO confirm against u.dashboardSchema.
u.dashboardSchema(
'MDS Performance',
'',
'tbO9LAiZz',
'now-1h',
'15s',
16,
[],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.3.2'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
// Template variable selecting the Prometheus data source used by the panels.
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
// Template variable `mds_servers`, populated from the ceph_daemon label
// values of the ceph_mds_inodes metric.
.addTemplate(
u.addTemplateSchema('mds_servers',
'$datasource',
'label_values(ceph_mds_inodes, ceph_daemon)',
1,
true,
1,
'MDS Server',
'')
)
.addPanels([
u.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
// Left panel: objecter read and write op rates summed over the selected
// MDS daemons.
CephfsOverviewGraphPanel(
'MDS Workload - $mds_servers',
'none',
'Reads(-) / Writes (+)',
'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))',
'Read Ops',
0,
1,
12,
9
)
.addTarget(u.addTargetSchema(
'sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))',
1,
'time_series',
'Write Ops'
))
// NOTE(review): the alias regex '/.*Reads/' does not match the legend
// 'Read Ops' used above, so this negative-Y override looks like it never
// applies -- confirm; changing it requires regenerating dashboards_out.
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
// Right panel: client request counter per MDS daemon.
CephfsOverviewGraphPanel(
'Client Request Load - $mds_servers',
'none',
'Client Requests',
'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}',
'{{ceph_daemon}}',
12,
1,
12,
9
),
]),
},
}
6 changes: 6 additions & 0 deletions monitoring/ceph-mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,6 @@
// Aggregate the per-component dashboard definitions into a single object.
// The operands are merged left to right with `+`.
local cephfs = import 'cephfs.libsonnet';
local host = import 'host.libsonnet';
local osd = import 'osd.libsonnet';
local pool = import 'pool.libsonnet';
local rbd = import 'rbd.libsonnet';
local rgw = import 'rgw.libsonnet';

cephfs + host + osd + pool + rbd + rgw

0 comments on commit 98236e3

Please sign in to comment.