diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c8c8396d1c61..bb42aa69a6880 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -663,8 +663,6 @@ option(WITH_SYSTEMD "build with systemd support" ON) add_subdirectory(src) add_subdirectory(qa) -add_subdirectory(monitoring) - add_subdirectory(doc) if(WITH_MANPAGE) add_subdirectory(man) @@ -679,9 +677,7 @@ if(LINUX) endif() option(WITH_GRAFANA "install grafana dashboards" OFF) -if(WITH_GRAFANA) - add_subdirectory(monitoring/grafana/dashboards) -endif() +add_subdirectory(monitoring/ceph-mixin) CMAKE_DEPENDENT_OPTION(WITH_BOOST_VALGRIND "Boost support for valgrind" OFF "NOT WITH_SYSTEM_BOOST" OFF) diff --git a/ceph.spec.in b/ceph.spec.in index ca00935dd8259..fba77950cf422 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -1434,7 +1434,7 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd-mirror # prometheus alerts -install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml +install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml %if 0%{?suse_version} # create __pycache__ directories and their contents @@ -2503,8 +2503,6 @@ exit 0 %endif %attr(0755,root,root) %dir %{_sysconfdir}/grafana/dashboards/ceph-dashboard %config %{_sysconfdir}/grafana/dashboards/ceph-dashboard/* -%doc monitoring/grafana/dashboards/README -%doc monitoring/grafana/README.md %files prometheus-alerts %if 0%{?suse_version} diff --git a/debian/rules b/debian/rules index 3ad9f248b7bf1..0c3939745be68 100755 --- a/debian/rules +++ b/debian/rules @@ -73,7 +73,7 @@ override_dh_auto_install: install -m 755 src/cephadm/cephadm $(DESTDIR)/usr/sbin/cephadm - install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml + install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml # doc/changelog is a directory, which confuses dh_installchangelogs override_dh_installchangelogs: diff --git a/monitoring/CMakeLists.txt b/monitoring/CMakeLists.txt deleted file mode 100644 index 7d0155c5f13e9..0000000000000 --- a/monitoring/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(prometheus) diff --git a/monitoring/ceph-mixin/.gitignore b/monitoring/ceph-mixin/.gitignore new file mode 100644 index 0000000000000..22d0d82f8095e --- /dev/null +++ b/monitoring/ceph-mixin/.gitignore @@ -0,0 +1 @@ +vendor diff --git a/monitoring/ceph-mixin/.pylintrc b/monitoring/ceph-mixin/.pylintrc new file mode 120000 index 0000000000000..26d91e4cd8665 --- /dev/null +++ b/monitoring/ceph-mixin/.pylintrc @@ -0,0 +1 @@ +../../src/pybind/mgr/dashboard/.pylintrc \ No newline at end of file diff --git a/monitoring/ceph-mixin/CMakeLists.txt b/monitoring/ceph-mixin/CMakeLists.txt new file mode 100644 index 0000000000000..8621c26734250 --- /dev/null +++ b/monitoring/ceph-mixin/CMakeLists.txt @@ -0,0 +1,53 @@ +if(WITH_GRAFANA) + set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard" + CACHE PATH "Location for grafana dashboards") + file(GLOB CEPH_GRAFANA_DASHBOARDS "dashboards_out/*.json") + install(FILES + ${CEPH_GRAFANA_DASHBOARDS} + DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR}) + if(WITH_TESTS) + ExternalProject_Add(jsonnet-bundler + GIT_REPOSITORY "https://github.com/jsonnet-bundler/jsonnet-bundler.git" + GIT_TAG "v0.4.0" + GIT_SHALLOW TRUE + 
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/jsonnet-bundler
+      CONFIGURE_COMMAND ""
+      DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src
+      BUILD_COMMAND make build
+      BUILD_IN_SOURCE 1
+      INSTALL_COMMAND cp /_output/jb )
+
+    set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR})
+    if(NOT CEPH_BUILD_VIRTUALENV)
+      include(AddCephTest)
+      set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR})
+      add_tox_test(grafana-lint TOX_ENVS lint)
+      add_tox_test(jsonnet-lint TOX_ENVS jsonnet-lint)
+      add_tox_test(jsonnet-check TOX_ENVS jsonnet-check)
+      add_tox_test(alerts-check TOX_ENVS alerts-check)
+      add_tox_test(alerts-lint TOX_ENVS alerts-lint)
+      add_tox_test(promql-query-test TOX_ENVS promql-query-test)
+    endif()
+
+    if(DEFINED PROMTOOL_EXECUTABLE)
+      set(promtool_executable_checked TRUE)
+    endif()
+
+    find_program(PROMTOOL_EXECUTABLE promtool)
+    if(PROMTOOL_EXECUTABLE)
+      execute_process(
+        COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null
+        RESULT_VARIABLE rc
+        OUTPUT_QUIET)
+      if(NOT rc)
+        add_ceph_test(run-promtool-unittests
+          ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_SOURCE_DIR}/monitoring/ceph-mixin/tests_alerts/test_alerts.yml)
+      elseif(NOT promtool_executable_checked)
+        message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, "
+          "please use a newer prometheus")
+      endif()
+    elseif(NOT promtool_executable_checked)
+      message(WARNING "run-promtool-unittests is skipped due to missing promtool")
+    endif()
+  endif()
+endif()
diff --git a/monitoring/ceph-mixin/Makefile b/monitoring/ceph-mixin/Makefile
new file mode 100644
index 0000000000000..44575b77eb6a4
--- /dev/null
+++ b/monitoring/ceph-mixin/Makefile
@@ -0,0 +1,24 @@
+all: fmt generate lint test
+
+fmt:
+	./lint-jsonnet.sh -i
+
+generate: dashboards_out
+
+vendor: jsonnetfile.lock.json
+	tox -ejsonnet-bundler-install
+
+dashboards_out: vendor $(JSONNETS_FILES)
+	tox -ejsonnet-fix
+
+lint:
+	tox -ejsonnet-lint
+	tox -ealerts-lint
+
+test: generate
+	tox -ejsonnet-check
+	tox -epromql-query-test
+	tox -ealerts-check
+check: test
+
+.PHONY: all fmt generate lint test check
diff --git a/monitoring/ceph-mixin/README.md b/monitoring/ceph-mixin/README.md
new file mode 100644
index 0000000000000..164b73b881c66
--- /dev/null
+++ b/monitoring/ceph-mixin/README.md
@@ -0,0 +1,52 @@
+## Prometheus Monitoring Mixin for Ceph
+A set of Grafana dashboards and Prometheus alerts for Ceph.
+
+All the Grafana dashboards are already generated in the `dashboards_out`
+directory and the alerts in the `prometheus_alerts.yaml` file.
+
+You can use the Grafana dashboards and alerts with Jsonnet like any other
+Prometheus mixin. You can find more resources about mixins in general at
+[monitoring.mixins.dev](https://monitoring.mixins.dev/).
+
+### Grafana dashboards for Ceph
+In `dashboards_out` you can find a collection of
+[Grafana](https://grafana.com/grafana) dashboards for Ceph monitoring.
+
+These dashboards are based on metrics collected
+by [Prometheus](https://prometheus.io/) scraping the [prometheus mgr
+module](http://docs.ceph.com/en/latest/mgr/prometheus/) and the
+[node_exporter](https://github.com/prometheus/node_exporter).
+
+#### Requirements
+
+- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed on
+  your Grafana instance
+- [Pie Chart Panel](https://grafana.com/grafana/plugins/grafana-piechart-panel/)
+  installed on your Grafana instance
+
+
+### Prometheus alerts
+In `prometheus_alerts.yaml` you'll find a set of Prometheus
+alert rules that should provide a decent set of default alerts for a
+Ceph cluster.
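+
+As a minimal sketch of consuming the mixin with Jsonnet (assuming the
+`mixin.libsonnet` entry point that `dashboards.jsonnet` in this directory also
+imports, and a hypothetical file name `render_alerts.jsonnet`), the alert
+rules can be rendered to JSON, which Prometheus accepts as a rule file since
+YAML is a superset of JSON:
+
+```jsonnet
+// render_alerts.jsonnet -- illustrative example, not part of this patch.
+// Evaluate with: jsonnet -J . -J vendor render_alerts.jsonnet
+// Emits the alert groups that alerts.libsonnet parses out of
+// prometheus_alerts.yaml.
+(import 'mixin.libsonnet').prometheusAlerts
+```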
+To use the alert file directly, put it wherever the `rule_files`
+section of your Prometheus configuration points.
+
+#### SNMP
+Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending Prometheus
+alerts to an SNMP management platform. The translation from Prometheus
+alert to SNMP trap requires the Prometheus alert to contain an OID that maps to
+a definition within the MIB. When making changes to the Prometheus alert rules
+file, developers should include any necessary changes to the MIB.
+
+### Building from Jsonnet
+
+- Install [jsonnet](https://jsonnet.org/)
+  - It is packaged as `jsonnet` in most distros and as
+    `golang-github-google-jsonnet` in Fedora
+- Install [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler)
+
+To rebuild all the generated files, you can run `tox -egrafonnet-fix`.
+
+The Jsonnet code in this directory depends on some third-party
+Jsonnet libraries. To update those libraries you can run `jb update` and then
+update the generated files using `tox -egrafonnet-fix`.
diff --git a/monitoring/ceph-mixin/alerts.libsonnet b/monitoring/ceph-mixin/alerts.libsonnet
new file mode 100644
index 0000000000000..8671637de5d5d
--- /dev/null
+++ b/monitoring/ceph-mixin/alerts.libsonnet
@@ -0,0 +1,3 @@
+{
+  prometheusAlerts+:: std.parseYaml(importstr 'prometheus_alerts.yaml'),
+}
diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet
new file mode 100644
index 0000000000000..0967ef424bce6
--- /dev/null
+++ b/monitoring/ceph-mixin/config.libsonnet
@@ -0,0 +1 @@
+{}
diff --git a/monitoring/ceph-mixin/dashboards.jsonnet b/monitoring/ceph-mixin/dashboards.jsonnet
new file mode 100644
index 0000000000000..9d913ed3f18c4
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
+local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
+
+{
+  [name]: dashboards[name]
+  for name in std.objectFields(dashboards)
+}
diff --git a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet
new file mode 100644
index 0000000000000..3dabc1608ad35
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet
@@ -0,0 +1,103 @@
+local g = import 'grafonnet/grafana.libsonnet';
+local u = import 'utils.libsonnet';
+
+{
+  grafanaDashboards+:: {
+    'cephfs-overview.json':
+      local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
+        u.graphPanelSchema({},
+                           title,
+                           '',
+                           'null',
+                           false,
+                           formatY1,
+                           'short',
+                           labelY1,
+                           null,
+                           0,
+                           1,
+                           '$datasource')
+        .addTargets(
+          [u.addTargetSchema(expr, 1, 'time_series', legendFormat)]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'MDS Performance',
+        '',
+        'tbO9LAiZz',
+        'now-1h',
+        '15s',
+        16,
+        [],
+        '',
+        {
+          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+        }
+      )
+      .addAnnotation(
+        u.addAnnotationSchema(
+          1,
+          '-- Grafana --',
+          true,
+          true,
+          'rgba(0, 211, 255, 1)',
+          'Annotations & Alerts',
+          'dashboard'
+        )
+      )
+      .addRequired(
+        type='grafana', id='grafana', name='Grafana', version='5.3.2'
+      )
+      .addRequired(
+        type='panel', id='graph', name='Graph', version='5.0.0'
+      )
+      .addTemplate(
+        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+      )
+      .addTemplate(
+        u.addTemplateSchema('mds_servers',
+                            '$datasource',
+                            'label_values(ceph_mds_inodes, ceph_daemon)',
1, + true, + 1, + 'MDS Server', + '') + ) + .addPanels([ + u.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + CephfsOverviewGraphPanel( + 'MDS Workload - $mds_servers', + 'none', + 'Reads(-) / Writes (+)', + 'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))', + 'Read Ops', + 0, + 1, + 12, + 9 + ) + .addTarget(u.addTargetSchema( + 'sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))', + 1, + 'time_series', + 'Write Ops' + )) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + CephfsOverviewGraphPanel( + 'Client Request Load - $mds_servers', + 'none', + 'Client Requests', + 'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}', + '{{ceph_daemon}}', + 12, + 1, + 12, + 9 + ), + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 0000000000000..72ca483248f8e --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,6 @@ +(import 'cephfs.libsonnet') + +(import 'host.libsonnet') + +(import 'osd.libsonnet') + +(import 'pool.libsonnet') + +(import 'rbd.libsonnet') + +(import 'rgw.libsonnet') diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet new file mode 100644 index 0000000000000..b2ee5c94f0043 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/host.libsonnet @@ -0,0 +1,562 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'hosts-overview.json': + local HostsOverviewSingleStatPanel(format, + title, + description, + valueName, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget( + u.addTargetSchema(expr, 1, targetFormat, '') + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = + u.graphPanelSchema( + {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource' + ) + .addTargets( + [u.addTargetSchema( + expr, 1, 'time_series', legendFormat + )] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Host Overview', + '', + 'y0KGL0iZz', + 'now-1h', + '10s', + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('osd_hosts', + '$datasource', + 'label_values(ceph_disk_occupation, exported_instance)', + 1, + true, + 1, + null, + '([^.]*).*') + ) + .addTemplate( + u.addTemplateSchema('mon_hosts', + '$datasource', + 'label_values(ceph_mon_metadata, ceph_daemon)', + 1, + true, + 1, + null, + 'mon.(.*)') + ) + .addTemplate( 
+ u.addTemplateSchema('mds_hosts', + '$datasource', + 'label_values(ceph_mds_inodes, ceph_daemon)', + 1, + true, + 1, + null, + 'mds.(.*)') + ) + .addTemplate( + u.addTemplateSchema('rgw_hosts', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + null, + 'rgw.(.*)') + ) + .addPanels([ + HostsOverviewSingleStatPanel( + 'none', + 'OSD Hosts', + '', + 'current', + 'count(sum by (hostname) (ceph_osd_metadata))', + 'time_series', + 0, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG CPU Busy', + 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', + 'current', + 'avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )', + 'time_series', + 4, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG RAM Utilization', + 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', + 'current', + 'avg (((node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})- (\n (node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +\n (node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})\n )) /\n (node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} ))', + 'time_series', + 8, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'none', + 'Physical IOPS', + 'IOPS Load at the device as reported by the OS on all OSD hosts', + 'current', + 'sum ((irate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[5m])))', + 'time_series', + 12, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percent', + 'AVG Disk Utilization', + 'Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)',
+          'current',
+          'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)',
+          'time_series',
+          16,
+          0,
+          4,
+          5
+        ),
+        HostsOverviewSingleStatPanel(
+          'bytes',
+          'Network Load',
+          'Total send/receive network load across all hosts in the ceph cluster',
+          'current',
+          |||
+            sum (
+              (
+                irate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or
+                irate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])
+              ) unless on (device, instance)
+              label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+            ) +
+            sum (
+              (
+                irate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or
+                irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])
+              ) unless on (device, instance)
+              label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+            )
+          |||
+          ,
+          'time_series',
+          20,
+          0,
+          4,
+          5
+        ),
+        HostsOverviewGraphPanel(
+          'CPU Busy - Top 10 Hosts',
+          'Show the top 10 busiest hosts by cpu',
+          'percent',
+          'topk(10,100 * ( 1 - (\n   avg by(instance) \n      (irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n       irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n   )\n  )\n)',
+          '{{instance}}',
+          0,
+          5,
+          12,
+          9
+        ),
+        HostsOverviewGraphPanel(
+          'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', |||
+            topk(10, (sum by(instance) (
+              (
+                irate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or
+                irate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])
+              ) +
+              (
+                irate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or
+                irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])
+              ) unless on (device, instance)
+              label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
+            ))
+          |||
+          , '{{instance}}', 12, 5, 12, 9
+        ),
+      ]),
+    'host-details.json':
+      local HostDetailsSingleStatPanel(format,
+                                       title,
+                                       description,
+                                       valueName,
+                                       expr,
+                                       targetFormat,
+                                       x,
+                                       y,
+                                       w,
+                                       h) =
+        u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+                              '$datasource',
+                              format,
+                              title,
+                              description,
+                              valueName,
+                              false,
+                              100,
+                              false,
+                              false,
+                              '')
+        .addTarget(u.addTargetSchema(expr,
+                                     1,
+                                     targetFormat,
+                                     '')) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      local HostDetailsGraphPanel(alias,
+                                  title,
+                                  description,
+                                  nullPointMode,
+                                  formatY1,
+                                  labelY1,
+                                  expr,
+                                  legendFormat,
+                                  x,
+                                  y,
+                                  w,
+                                  h) =
+        u.graphPanelSchema(alias,
+                           title,
+                           description,
+                           nullPointMode,
+                           false,
+                           formatY1,
+                           'short',
+                           labelY1,
+                           null,
+                           null,
+                           1,
+                           '$datasource')
+        .addTargets(
+          [u.addTargetSchema(expr,
+                             1,
+                             'time_series',
+                             legendFormat)]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'Host Details',
+        '',
+        'rtOg0AiWz',
+        'now-1h',
+        '10s',
+        16,
['overview'], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + u.addAnnotationSchema( + 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('ceph_hosts', '$datasource', 'label_values(node_scrape_collector_success, instance) ', 1, false, 3, 'Hostname', '([^.:]*).*') + ) + .addPanels([ + u.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + HostDetailsSingleStatPanel( + 'none', + 'OSDs', + '', + 'current', + "count(sum by (ceph_daemon) (ceph_osd_metadata{hostname='$ceph_hosts'}))", + 'time_series', + 0, + 1, + 3, + 5 + ), + HostDetailsGraphPanel( + { + interrupt: '#447EBC', + steal: '#6D1F62', + system: '#890F02', + user: '#3F6833', + wait: '#C15C17', + }, 'CPU Utilization', "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", 'null', 'percent', '% Utilization', 'sum by (mode) (\n irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]))\n) * 100', '{{mode}}', 3, 1, 6, 10 + ), + HostDetailsGraphPanel( + { + Available: '#508642', + Free: '#508642', + Total: '#bf1b00', + Used: '#bf1b00', + total: '#bf1b00', + used: '#0a50a1', + }, + 'RAM Usage', + '', + 'null', + 'bytes', + 'RAM used', + 'node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', + 'Free', + 9, + 1, + 6, + 10 + ) + .addTargets( + [ + u.addTargetSchema('node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', 1, 'time_series', 'total'), + u.addTargetSchema('(node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n(node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n(node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) \n', 1, 'time_series', 'buffers/cache'), + u.addTargetSchema('(node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})- (\n (node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n (node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or 
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})\n )\n \n', 1, 'time_series', 'used'), + ] + ) + .addSeriesOverride( + { + alias: 'total', + color: '#bf1b00', + fill: 0, + linewidth: 2, + stack: false, + } + ), + HostDetailsGraphPanel( + {}, + 'Network Load', + "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + 'null', + 'decbytes', + 'Send (-) / Receive (+)', + 'sum by (device) (\n irate(node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', + '{{device}}.rx', + 15, + 1, + 6, + 10 + ) + .addTargets( + [ + u.addTargetSchema('sum by (device) (\n irate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', 1, 'time_series', '{{device}}.tx'), + ] + ) + .addSeriesOverride( + { alias: '/.*tx/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + 'Network drop rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + 'irate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', + '{{device}}.rx', + 21, + 1, + 3, + 5 + ) + .addTargets( + [ + u.addTargetSchema( + 'irate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', 1, 'time_series', '{{device}}.tx' + ), + ] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + HostDetailsSingleStatPanel( + 'bytes', + 'Raw Capacity', + 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', + 'current', + 'sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"})', + 'time_series', + 0, + 6, + 3, + 5 + ), + HostDetailsGraphPanel( + {}, + 'Network error rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + 'irate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', + '{{device}}.rx', + 21, + 6, + 3, + 5 + ) + .addTargets( + [u.addTargetSchema( + 'irate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', 1, 'time_series', '{{device}}.tx' + )] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + u.addRowSchema(false, + true, + 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk IOPS', + "For any OSD devices on the host, this chart shows the iops per physical device. 
Each device is shown by its name and corresponding OSD id value",
+          'connected',
+          'ops',
+          'Read (-) / Write (+)',
+          'label_replace(\n  (\n    irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n    irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n  ),\n  "instance",\n  "$1",\n  "instance",\n  "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n  label_replace(\n    label_replace(\n      ceph_disk_occupation_human,\n      "device",\n      "$1",\n      "device",\n      "/dev/(.*)"\n    ),\n    "instance",\n    "$1",\n    "instance",\n    "([^:.]*).*"\n  )',
+          '{{device}}({{ceph_daemon}}) writes',
+          0,
+          12,
+          11,
+          9
+        )
+        .addTargets(
+          [
+            u.addTargetSchema(
+              'label_replace(\n  (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n  "instance",\n  "$1",\n  "instance",\n  "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n  label_replace(\n    label_replace(\n      ceph_disk_occupation_human,\n      "device",\n      "$1",\n      "device",\n      "/dev/(.*)"\n    ),\n    "instance",\n    "$1",\n    "instance",\n    "([^:.]*).*"\n  )',
+              1,
+              'time_series',
+              '{{device}}({{ceph_daemon}}) reads'
+            ),
+          ]
+        )
+        .addSeriesOverride(
+          { alias: '/.*reads/', transform: 'negative-Y' }
+        ),
+        HostDetailsGraphPanel(
+          {},
+          '$ceph_hosts Throughput by Disk',
+          'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
+          'connected',
+          'Bps',
+          'Read (-) / Write (+)',
+          'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+          '{{device}}({{ceph_daemon}}) write',
+          12,
+          12,
+          11,
+          9
+        )
+        .addTargets(
+          [u.addTargetSchema(
+            'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+            1,
+            'time_series',
+            '{{device}}({{ceph_daemon}}) read'
+          )]
+        )
+        .addSeriesOverride(
+          { alias: '/.*read/', transform: 'negative-Y' }
+        ),
+        HostDetailsGraphPanel(
+          {},
+          '$ceph_hosts Disk Latency',
+          "For OSD hosts, this chart shows the latency at the physical drive.
Each drive is shown by device name, with its corresponding OSD id",
+          'null as zero',
+          's',
+          '',
+          'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+          '{{device}}({{ceph_daemon}})',
+          0,
+          21,
+          11,
+          9
+        ),
+        HostDetailsGraphPanel(
+          {},
+          '$ceph_hosts Disk utilization',
+          'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
+          'connected',
+          'percent',
+          '%Util',
+          'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+          '{{device}}({{ceph_daemon}})',
+          12,
+          21,
+          11,
+          9
+        ),
+      ]),
+  },
+}
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
new file mode 100644
index 0000000000000..8b425fb395a37
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -0,0 +1,533 @@
+local g = import 'grafonnet/grafana.libsonnet';
+local u = import 'utils.libsonnet';
+
+{
+  grafanaDashboards+:: {
+    'osds-overview.json':
+      local OsdOverviewStyle(alias, pattern, type, unit) =
+        u.addStyle(alias, null, [
+          'rgba(245, 54, 54, 0.9)',
+          'rgba(237, 129, 40, 0.89)',
+          'rgba(50, 172, 45, 0.97)',
+        ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
+      local OsdOverviewGraphPanel(alias,
+                                  title,
+                                  description,
+                                  formatY1,
+                                  labelY1,
+                                  min,
+                                  expr,
+                                  legendFormat1,
+                                  x,
+                                  y,
+                                  w,
+                                  h) =
+        u.graphPanelSchema(alias,
+                           title,
+                           description,
+                           'null',
+                           false,
+                           formatY1,
+                           'short',
+                           labelY1,
+                           null,
+                           min,
+                           1,
+                           '$datasource')
+        .addTargets(
+          [u.addTargetSchema(expr, 1, 'time_series', legendFormat1)]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+      local OsdOverviewPieChartPanel(alias, description, title) =
+        u.addPieChartSchema(alias,
+                            '$datasource',
+                            description,
+                            'Under graph',
+                            'pie',
+                            title,
+                            'current');
+      local OsdOverviewSingleStatPanel(colors,
+                                       format,
+                                       title,
+                                       description,
+                                       valueName,
+                                       colorValue,
+                                       gaugeMaxValue,
+                                       gaugeShow,
+                                       sparkLineShow,
+                                       thresholds,
+                                       expr,
+                                       targetFormat,
+                                       x,
+                                       y,
+                                       w,
+                                       h) =
+        u.addSingleStatSchema(
+          colors,
+          '$datasource',
+          format,
+          title,
+          description,
+          valueName,
+          colorValue,
+          gaugeMaxValue,
+          gaugeShow,
+          sparkLineShow,
+          thresholds
+        )
+        .addTarget(
+          u.addTargetSchema(expr, 1, targetFormat, '')
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'OSD Overview',
+        '',
+        'lo02I1Aiz',
+        'now-1h',
+        '10s',
+        16,
+        [],
+        '',
+        {
+          refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+          time_options: ['5m', '15m', '1h',
'6h', '12h', '24h', '2d', '7d', '30d'],
+        }
+      )
+      .addAnnotation(
+        u.addAnnotationSchema(
+          1,
+          '-- Grafana --',
+          true,
+          true,
+          'rgba(0, 211, 255, 1)',
+          'Annotations & Alerts',
+          'dashboard'
+        )
+      )
+      .addRequired(
+        type='grafana', id='grafana', name='Grafana', version='5.0.0'
+      )
+      .addRequired(
+        type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
+      )
+      .addRequired(
+        type='panel', id='graph', name='Graph', version='5.0.0'
+      )
+      .addRequired(
+        type='panel', id='table', name='Table', version='5.0.0'
+      )
+      .addTemplate(
+        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+      )
+      .addPanels([
+        OsdOverviewGraphPanel(
+          { '@95%ile': '#e0752d' },
+          'OSD Read Latencies',
+          '',
+          'ms',
+          null,
+          '0',
+          'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
+          'AVG read',
+          0,
+          0,
+          8,
+          8
+        )
+        .addTargets(
+          [
+            u.addTargetSchema(
+              'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
+              1,
+              'time_series',
+              'MAX read'
+            ),
+            u.addTargetSchema(
+              'quantile(0.95,\n  (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile'
+            ),
+          ],
+        ),
+        u.addTableSchema(
+          '$datasource',
+          "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster",
+          { col: 2, desc: true },
+          [
+            OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+            OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+            OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
+          ],
+          'Highest READ Latencies',
+          'table'
+        )
+        .addTarget(
+          u.addTargetSchema(
+            'topk(10,\n  (sort(\n    (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n  ))\n)\n\n', 1, 'table', ''
+          )
+        ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
+        OsdOverviewGraphPanel(
+          {
+            '@95%ile write': '#e0752d',
+          },
+          'OSD Write Latencies',
+          '',
+          'ms',
+          null,
+          '0',
+          'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
+          'AVG write',
+          12,
+          0,
+          8,
+          8
+        )
+        .addTargets(
+          [
+            u.addTargetSchema(
+              'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
+              1,
+              'time_series',
+              'MAX write'
+            ),
+            u.addTargetSchema(
+              'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile write'
+            ),
+          ],
+        ),
+        u.addTableSchema(
+          '$datasource',
+          "This table shows the OSDs that are delivering the 10 highest write latencies within the cluster",
+          { col: 2, desc: true },
+          [
+            OsdOverviewStyle(
+              'OSD ID', 'ceph_daemon', 'string', 'short'
+            ),
+            OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+            OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
+          ],
+          'Highest WRITE Latencies',
+          'table'
+        )
+        .addTarget(
+          u.addTargetSchema(
+            'topk(10,\n  (sort(\n    (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n  ))\n)\n\n',
+            1,
+            'table',
+            ''
+          )
+        ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
+        OsdOverviewPieChartPanel(
+          {}, '', 'OSD Types Summary'
+        )
+        .addTarget(
+          u.addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}')
+        ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
+        OsdOverviewPieChartPanel(
+          { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD
Objectstore Types'
+        )
+        .addTarget(
+          u.addTargetSchema(
+            'count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore'
+          )
+        )
+        .addTarget(
+          u.addTargetSchema(
+            'absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore'
+          )
+        ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
+        OsdOverviewPieChartPanel(
+          {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
+        )
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB'
+        ))
+        .addTarget(u.addTargetSchema(
+          'count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '12TB+'
+        )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
+        g.graphPanel.new(bars=true,
+                         datasource='$datasource',
+                         title='Distribution of PGs per OSD',
+                         x_axis_buckets=20,
+                         x_axis_mode='histogram',
+                         x_axis_values=['total'],
+                         formatY1='short',
+                         formatY2='short',
+                         labelY1='# of OSDs',
+                         min='0',
+                         nullPointMode='null')
+        .addTarget(u.addTargetSchema(
+          'ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD'
+        )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
+        OsdOverviewSingleStatPanel(
+          ['#d44a3a', '#299c46'],
+          'percentunit',
+          'OSD onode Hits Ratio',
+          'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
+          'current',
+          true,
+          1,
+          true,
+          false,
+          '.75',
+          'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))',
+          'time_series',
+          20,
+          8,
+          4,
+          8
+        ),
+        u.addRowSchema(false,
+                       true,
+                       'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
+        OsdOverviewGraphPanel(
+          {},
+          'Read/Write Profile',
+          'Show the read/write workload profile over time',
+          'short',
+          null,
+          null,
+          'round(sum(irate(ceph_pool_rd[30s])))',
+          'Reads',
+          0,
+          17,
+          24,
+          8
+        )
+        .addTargets([u.addTargetSchema(
+          'round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes'
+        )]),
+      ]),
+    'osd-device-details.json':
+      local OsdDeviceDetailsPanel(title,
+                                  description,
+                                  formatY1,
+                                  labelY1,
+                                  expr1,
+                                  expr2,
+                                  legendFormat1,
+                                  legendFormat2,
+                                  x,
+                                  y,
+                                  w,
+                                  h) =
+        u.graphPanelSchema({},
+                           title,
+                           description,
+                           'null',
+                           false,
+                           formatY1,
+                           'short',
+                           labelY1,
+                           null,
+                           null,
+                           1,
+                           '$datasource')
+        .addTargets(
+          [
+            u.addTargetSchema(expr1,
+                              1,
+                              'time_series',
+                              legendFormat1),
+            u.addTargetSchema(expr2, 1, 'time_series', legendFormat2),
+          ]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'OSD device details',
+        '',
+        'CrAHE0iZz',
+        'now-3h',
+        '',
+        16,
+        [],
+        '',
+        {
+          refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m',
'30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('osd', + '$datasource', + 'label_values(ceph_osd_metadata,ceph_daemon)', + 1, + false, + 1, + 'OSD', + '(.*)') + ) + .addPanels([ + u.addRowSchema( + false, true, 'OSD Performance' + ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + '$osd Latency', + '', + 's', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])', + 'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])', + 'read', + 'write', + 0, + 1, + 6, + 9 + ) + .addSeriesOverride( + { + alias: 'read', + transform: 'negative-Y', + } + ), + OsdDeviceDetailsPanel( + '$osd R/W IOPS', + '', + 'short', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])', + 'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])', + 'Reads', + 'Writes', + 6, + 1, + 6, + 9 + ) + .addSeriesOverride( + { alias: 'Reads', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + '$osd R/W Bytes', + '', + 'bytes', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])', + 'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])', + 'Read Bytes', + 'Write Bytes', + 12, + 1, + 6, + 9 + ) + .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }), + u.addRowSchema( + false, true, 'Physical Device Performance' + ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + 'Physical Device Latency for $osd', + '', + 's', + 'Read (-) / Write (+)', + '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))', + '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))', + '{{instance}}/{{device}} Reads', + '{{instance}}/{{device}} Writes', + 0, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W IOPS for $osd', + '', + 'short', + 'Read (-) / Write (+)', + 'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) 
label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{device}} on {{instance}} Writes', + '{{device}} on {{instance}} Reads', + 6, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W Bytes for $osd', + '', + 'Bps', + 'Read (-) / Write (+)', + 'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{instance}} {{device}} Reads', + '{{instance}} {{device}} Writes', + 12, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + u.graphPanelSchema( + {}, + 'Physical Device Util% for $osd', + '', + 'null', + false, + 'percentunit', + 'short', + null, + null, + null, + 1, + '$datasource' + ) + .addTarget(u.addTargetSchema( + 'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 1, + 'time_series', + '{{device}} on {{instance}}' + )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/pool.libsonnet b/monitoring/ceph-mixin/dashboards/pool.libsonnet new file mode 100644 index 0000000000000..527c9124ba2c5 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/pool.libsonnet @@ -0,0 +1,570 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'pool-overview.json': + local PoolOverviewSingleStatPanel(format, + title, + description, + valueName, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget(u.addTargetSchema(expr, 1, targetFormat, '')) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local PoolOverviewStyle(alias, + pattern, + type, + unit, + colorMode, + thresholds, + valueMaps) = + u.addStyle(alias, + colorMode, + [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + thresholds, + type, + unit, + valueMaps); + + local PoolOverviewGraphPanel(title, + description, + formatY1, + labelY1, + expr, + targetFormat, + legendFormat, + x, + y, + w, + h) = + u.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, + 1, + 'time_series', + legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Ceph Pools Overview', + '', + 'z99hzWtmk', + 'now-1h', + '15s', + 22, + [], + '', + { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], 
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'] } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'Dashboard1', + label='Data Source') + ) + .addTemplate( + g.template.custom(label='TopK', + name='topk', + current='15', + query='15') + ) + .addPanels([ + PoolOverviewSingleStatPanel( + 'none', + 'Pools', + '', + 'avg', + 'count(ceph_pool_metadata)', + 'table', + 0, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Pools with Compression', + 'Count of the pools that have compression enabled', + 'current', + 'count(ceph_pool_metadata{compression_mode!="none"})', + '', + 3, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Total Raw Capacity', + 'Total raw capacity available to the cluster', + 'current', + 'sum(ceph_osd_stat_bytes)', + '', + 6, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Raw Capacity Consumed', + 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', + 'current', + 'sum(ceph_pool_bytes_used)', + '', + 9, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Logical Stored ', + 'Total of client data stored in the cluster', + 'current', + 'sum(ceph_pool_stored)', + '', + 12, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Compression Savings', + 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', + 'current', + 'sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)', + '', + 15, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'percent', + 'Compression Eligibility', + 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data\n', + 'current', + '(sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0)) * 100', + 'table', + 18, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Compression Factor', + 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)', + 'current', + 'sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 0)', + '', + 21, + 0, + 3, + 3 + ), + u.addTableSchema( + '$datasource', + '', + { col: 5, desc: true }, + [ + PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []), + PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []), + PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []), + PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []), + PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []), + PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []), + PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []), + PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []), + PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []), + PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []), + PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]), + ], + 'Pool Overview', + 'table' + ) + .addTargets( + [ + u.addTargetSchema( + '(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5)', + 1, + 'table', + 'A' + ), + u.addTargetSchema( + 'ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata', + 1, + 'table', + 'B' + ), + u.addTargetSchema( + '((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100', + 1, + 'table', + 'C' + ), + u.addTargetSchema( + '(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)', + 1, + 'table', + 'D' + ), + u.addTargetSchema( + '(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)', + 1, + 'table', + 'E' + ), + u.addTargetSchema( + 'delta(ceph_pool_stored[5d])', 1, 'table', 'F' + ), + u.addTargetSchema( + 'rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])', + 1, + 'table', + 'G' + ), + u.addTargetSchema( + 'rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])', + 1, + 'table', + 'H' + ), + u.addTargetSchema( + 'ceph_pool_metadata', 1, 'table', 'I' + ), + u.addTargetSchema( + 'ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata', + 1, + 'table', + 'J' + ), + u.addTargetSchema( + 'ceph_pool_metadata{compression_mode!="none"}', 1, 'table', 'K' + ), + u.addTargetSchema('', '', '', 'L'), + ] + ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } }, + PoolOverviewGraphPanel( + 'Top $topk Client IOPS by Pool', + 'This chart shows the sum of read and write IOPS from all clients by pool', + 'short', + 'IOPS', + 'topk($topk,round((rate(ceph_pool_rd[30s]) + 
rate(ceph_pool_wr[30s])),1) * on(pool_id) group_left(instance,name) ceph_pool_metadata) ',
+          'time_series',
+          '{{name}} ',
+          0,
+          9,
+          12,
+          8
+        )
+        .addTarget(
+          u.addTargetSchema(
+            'topk($topk,rate(ceph_pool_wr[30s]) * on(pool_id) group_left(instance,name) ceph_pool_metadata) ',
+            1,
+            'time_series',
+            '{{name}} - write'
+          )
+        ),
+        PoolOverviewGraphPanel(
+          'Top $topk Client Bandwidth by Pool',
+          'The chart shows the sum of read and write bytes from all clients, by pool',
+          'Bps',
+          'Throughput',
+          'topk($topk,(rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) * on(pool_id) group_left(instance,name) ceph_pool_metadata)',
+          'time_series',
+          '{{name}}',
+          12,
+          9,
+          12,
+          8
+        ),
+        PoolOverviewGraphPanel(
+          'Pool Capacity Usage (RAW)',
+          'Historical view of capacity usage, to help identify growth and trends in pool consumption',
+          'bytes',
+          'Capacity Used',
+          'ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata',
+          '',
+          '{{name}}',
+          0,
+          17,
+          24,
+          7
+        ),
+      ]),
+    'pool-detail.json':
+      local PoolDetailSingleStatPanel(format,
+                                      title,
+                                      description,
+                                      valueName,
+                                      colorValue,
+                                      gaugeMaxValue,
+                                      gaugeShow,
+                                      sparkLineShow,
+                                      thresholds,
+                                      expr,
+                                      targetFormat,
+                                      x,
+                                      y,
+                                      w,
+                                      h) =
+        u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+                              '$datasource',
+                              format,
+                              title,
+                              description,
+                              valueName,
+                              colorValue,
+                              gaugeMaxValue,
+                              gaugeShow,
+                              sparkLineShow,
+                              thresholds)
+        .addTarget(u.addTargetSchema(expr, 1, targetFormat, '')) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      local PoolDetailGraphPanel(alias,
+                                 title,
+                                 description,
+                                 formatY1,
+                                 labelY1,
+                                 expr,
+                                 targetFormat,
+                                 legendFormat,
+                                 x,
+                                 y,
+                                 w,
+                                 h) =
+        u.graphPanelSchema(alias,
+                           title,
+                           description,
+                           'null as zero',
+                           false,
+                           formatY1,
+                           'short',
+                           labelY1,
+                           null,
+                           null,
+                           1,
+                           '$datasource')
+        .addTargets(
+          [u.addTargetSchema(expr, 1, 'time_series', legendFormat)]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'Ceph Pool Details',
+        '',
+        '-xyV8KCiz',
+        'now-1h',
+        '15s',
+        22,
+        [],
+        '',
+        {
+          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+        }
+      )
+      .addRequired(
+        type='grafana', id='grafana', name='Grafana', version='5.3.2'
+      )
+      .addRequired(
+        type='panel', id='graph', name='Graph', version='5.0.0'
+      )
+      .addRequired(
+        type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+      )
+      .addAnnotation(
+        u.addAnnotationSchema(
+          1,
+          '-- Grafana --',
+          true,
+          true,
+          'rgba(0, 211, 255, 1)',
+          'Annotations & Alerts',
+          'dashboard'
+        )
+      )
+      .addTemplate(
+        g.template.datasource('datasource',
+                              'prometheus',
+                              'Prometheus admin.virt1.home.fajerski.name:9090',
+                              label='Data Source')
+      )
+      .addTemplate(
+        u.addTemplateSchema('pool_name',
+                            '$datasource',
+                            'label_values(ceph_pool_metadata,name)',
+                            1,
+                            false,
+                            1,
+                            'Pool Name',
+                            '')
+      )
+      .addPanels([
+        PoolDetailSingleStatPanel(
+          'percentunit',
+          'Capacity used',
+          '',
+          'current',
+          true,
+          1,
+          true,
+          true,
+          '.7,.8',
+          '(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
+          'time_series',
+          0,
+          0,
+          7,
+          7
+        ),
+        PoolDetailSingleStatPanel(
+          's',
+          'Time till full',
+          'Time till pool is full assuming the average fill rate of the last 6 hours',
+          'current',
+          false,
+          100,
+          false,
+          false,
+          '',
+          '(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * on(pool_id)
group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} > 0',
+          'time_series',
+          7,
+          0,
+          5,
+          7
+        ),
+        PoolDetailGraphPanel(
+          {
+            read_op_per_sec:
+              '#3F6833',
+            write_op_per_sec: '#E5AC0E',
+          },
+          '$pool_name Object Ingress/Egress',
+          '',
+          'ops',
+          'Objects out(-) / in(+) ',
+          'deriv(ceph_pool_objects[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
+          'time_series',
+          'Objects per second',
+          12,
+          0,
+          12,
+          7
+        ),
+        PoolDetailGraphPanel(
+          {
+            read_op_per_sec: '#3F6833',
+            write_op_per_sec: '#E5AC0E',
+          }, '$pool_name Client IOPS', '', 'iops', 'Read (-) / Write (+)', 'irate(ceph_pool_rd[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', 'time_series', 'reads', 0, 7, 12, 7
+        )
+        .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
+        .addTarget(
+          u.addTargetSchema(
+            'irate(ceph_pool_wr[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', 1, 'time_series', 'writes'
+          )
+        ),
+        PoolDetailGraphPanel(
+          {
+            read_op_per_sec: '#3F6833',
+            write_op_per_sec: '#E5AC0E',
+          },
+          '$pool_name Client Throughput',
+          '',
+          'Bps',
+          'Read (-) / Write (+)',
+          'irate(ceph_pool_rd_bytes[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
+          'time_series',
+          'reads',
+          12,
+          7,
+          12,
+          7
+        )
+        .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
+        .addTarget(
+          u.addTargetSchema(
+            'irate(ceph_pool_wr_bytes[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
+            1,
+            'time_series',
+            'writes'
+          )
+        ),
+        PoolDetailGraphPanel(
+          {
+            read_op_per_sec: '#3F6833',
+            write_op_per_sec: '#E5AC0E',
+          },
+          '$pool_name Objects',
+          '',
+          'short',
+          'Objects',
+          'ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
+          'time_series',
+          'Number of Objects',
+          0,
+          14,
+          12,
+          7
+        ),
+      ]),
+  },
+}
diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet
new file mode 100644
index 0000000000000..d464f889f54db
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet
@@ -0,0 +1,309 @@
+local g = import 'grafonnet/grafana.libsonnet';
+local u = import 'utils.libsonnet';
+
+{
+  grafanaDashboards+:: {
+    'rbd-details.json':
+      local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) =
+        u.graphPanelSchema({},
+                           title,
+                           '',
+                           'null as zero',
+                           false,
+                           formatY1,
+                           formatY1,
+                           null,
+                           null,
+                           0,
+                           1,
+                           '$Datasource')
+        .addTargets(
+          [
+            u.addTargetSchema(expr1,
+                              1,
+                              'time_series',
+                              '{{pool}} Write'),
+            u.addTargetSchema(expr2, 1, 'time_series', '{{pool}} Read'),
+          ]
+        ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+      u.dashboardSchema(
+        'RBD Details',
+        'Detailed Performance of RBD Images (IOPS/Throughput/Latency)',
+        'YhCYGcuZz',
+        'now-1h',
+        false,
+        16,
+        [],
+        '',
+        {
+          refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+        }
+      )
+      .addAnnotation(
+        u.addAnnotationSchema(
+          1,
+          '-- Grafana --',
+          true,
+          true,
+          'rgba(0, 211, 255, 1)',
+          'Annotations & Alerts',
+          'dashboard'
+        )
+      )
+      .addRequired(
+        type='grafana', id='grafana', name='Grafana', version='5.3.3'
+      )
+      .addRequired(
+        type='panel', id='graph', name='Graph', version='5.0.0'
+      )
+      .addTemplate(
+        g.template.datasource('Datasource', 'prometheus', 'default', label=null)
+      )
+      .addTemplate(
+        u.addTemplateSchema('Pool',
+                            '$Datasource',
+
'label_values(pool)', + 1, + false, + 0, + '', + '') + ) + .addTemplate( + u.addTemplateSchema('Image', + '$Datasource', + 'label_values(image)', + 1, + false, + 0, + '', + '') + ) + .addPanels([ + RbdDetailsPanel( + 'IOPS', + 'iops', + 'irate(ceph_rbd_write_ops{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_ops{pool="$Pool", image="$Image"}[30s])', + 0, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Throughput', + 'Bps', + 'irate(ceph_rbd_write_bytes{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_bytes{pool="$Pool", image="$Image"}[30s])', + 8, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Average Latency', + 'ns', + 'irate(ceph_rbd_write_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_write_latency_count{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_read_latency_count{pool="$Pool", image="$Image"}[30s])', + 16, + 0, + 8, + 9 + ), + ]), + 'rbd-overview.json': + local RgwOverviewStyle(alias, pattern, type, unit) = + u.addStyle(alias, + null, + ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + [], + type, + unit, + []); + local RbdOverviewPanel(title, + formatY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + u.graphPanelSchema({}, + title, + '', + 'null', + false, + formatY1, + 'short', + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + u.addTargetSchema(expr1, + 1, + 'time_series', + legendFormat1), + u.addTargetSchema(expr2, + 1, + 'time_series', + legendFormat2), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RBD Overview', + '', + '41FrpeUiz', + 'now-1h', + '30s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.4.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='datasource', id='prometheus', name='Prometheus', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addPanels([ + RbdOverviewPanel( + 'IOPS', + 'short', + 'round(sum(irate(ceph_rbd_write_ops[30s])))', + 'round(sum(irate(ceph_rbd_read_ops[30s])))', + 'Writes', + 'Reads', + 0, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Throughput', + 'Bps', + 'round(sum(irate(ceph_rbd_write_bytes[30s])))', + 'round(sum(irate(ceph_rbd_read_bytes[30s])))', + 'Write', + 'Read', + 8, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Average Latency', + 'ns', + 'round(sum(irate(ceph_rbd_write_latency_sum[30s])) / sum(irate(ceph_rbd_write_latency_count[30s])))', + 'round(sum(irate(ceph_rbd_read_latency_sum[30s])) / sum(irate(ceph_rbd_read_latency_count[30s])))', + 'Write', + 'Read', + 16, + 0, + 8, + 7 + ), + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], 
+ 'Highest IOPS', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10, (sort((irate(ceph_rbd_write_ops[30s]) + on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])))))', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } }, + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Throughput', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10, sort(sum(irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])) by (pool, image, namespace)))', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } }, + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Latency', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10,\n sum(\n irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n ) by (pool, image, namespace)\n)', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet new file mode 100644 index 0000000000000..e0ad25fb59b24 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -0,0 +1,643 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'radosgw-sync-overview.json': + local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = + u.graphPanelSchema({}, + title, + '', + 'null as zero', + true, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric, + 1, + 'time_series', + '{{source_zone}}')] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Sync Overview', + '', + 'rgw-sync-overview', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + u.addTemplateSchema('rgw_servers', '$datasource', 'prometehus', 1, true, 1, '', '') + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addPanels([ + RgwSyncOverviewPanel( + 'Replication (throughput) from Source Zone', + 'Bps', + null, + 'ceph_data_sync_from_zone_fetch_bytes_sum', + 0, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Replication (objects) from Source Zone', + 'short', + 'Objects/s', + 'ceph_data_sync_from_zone_fetch_bytes_count', + 8, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Polling Request Latency from 
Source Zone', + 'ms', + null, + 'ceph_data_sync_from_zone_poll_latency_sum', + 16, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Unsuccessful Object Replications from Source Zone', + 'short', + 'Count/s', + 'ceph_data_sync_from_zone_fetch_errors', + 0, + 7, + 8, + 7 + ), + ]), + 'radosgw-overview.json': + local RgwOverviewPanel( + title, + description, + formatY1, + formatY2, + expr1, + legendFormat1, + x, + y, + w, + h, + datasource='$datasource', + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false + ) = + u.graphPanelSchema( + {}, + title, + description, + 'null', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + datasource, + legend_alignAsTable, + legend_avg, + legend_min, + legend_max, + legend_current, + legend_values + ) + .addTargets( + [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Overview', + '', + 'WAkugZpiz', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + u.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + '', + '' + ) + ) + .addTemplate( + u.addTemplateSchema( + 'code', + '$datasource', + 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', + 1, + true, + 1, + 'HTTP Code', + '' + ) + ) + .addTemplate( + u.addTemplateSchema( + 'ingress_service', + '$datasource', + 'label_values(haproxy_server_status, instance)', + 1, + true, + 1, + 'Ingress Service', + '' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addPanels([ + u.addRowSchema(false, + true, + 'RGW Overview - All Gateways') + + { + gridPos: { x: 0, y: 0, w: 24, h: 1 }, + }, + RgwOverviewPanel( + 'Average GET/PUT Latencies', + '', + 's', + 'short', + 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', + 'GET AVG', + 0, + 1, + 8, + 7 + ).addTargets( + [ + u.addTargetSchema( + 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', + 1, + 'time_series', + 'PUT AVG' + ), + ] + ), + RgwOverviewPanel( + 'Total Requests/sec by RGW Instance', + '', + 'none', + 'short', + 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', + '{{rgw_host}}', + 8, + 1, + 7, + 7 + ), + RgwOverviewPanel( + 'GET Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', + 's', + 'short', + 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", 
"rgw.(.*)")', + '{{rgw_host}}', + 15, + 1, + 6, + 7 + ), + RgwOverviewPanel( + 'Bandwidth Consumed by Type', + 'Total bytes transferred in/out of all radosgw instances within the cluster', + 'bytes', + 'short', + 'sum(rate(ceph_rgw_get_b[30s]))', + 'GETs', + 0, + 8, + 8, + 6 + ).addTargets( + [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))', + 1, + 'time_series', + 'PUTs')] + ), + RgwOverviewPanel( + 'Bandwidth by RGW Instance', + 'Total bytes transferred in/out through get/put operations, by radosgw instance', + 'bytes', + 'short', + 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', + '{{rgw_host}}', + 8, + 8, + 7, + 6 + ), + RgwOverviewPanel( + 'PUT Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', + 's', + 'short', + 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', + '{{rgw_host}}', + 15, + 8, + 6, + 6 + ), + u.addRowSchema( + false, true, 'RGW Overview - HAProxy Metrics' + ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } }, + RgwOverviewPanel( + 'Total responses by HTTP code', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)', + 'Frontend {{ code }}', + 0, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + { alias: '/.*1.*/' }, + { alias: '/.*2.*/' }, + { alias: '/.*3.*/' }, + { alias: '/.*4.*/' }, + { alias: '/.*5.*/' }, + { alias: '/.*other.*/' }, + ]), + RgwOverviewPanel( + 'Total requests / responses', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', + 'Requests', + 5, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'), + u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'), + u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'), + u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'), + u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'), + u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'), + ] + ) + .addSeriesOverride([ + { + alias: 
'/.*Response.*/', + transform: 'negative-Y', + }, + { + alias: '/.*Backend.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Total number of connections', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', + 'Front', + 10, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'), + u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'), + ] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Current total of incoming / outgoing bytes', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', + 'IN Front', + 15, + 12, + 6, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'), + u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'), + u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'), + ] + ) + .addSeriesOverride([ + { + alias: '/.*OUT.*/', + transform: 'negative-Y', + }, + ]), + ]), + 'radosgw-detail.json': + local RgwDetailsPanel(aliasColors, + title, + description, + formatY1, + formatY2, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + u.graphPanelSchema(aliasColors, + title, + description, + 'null', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Instance Detail', + '', + 'x5ARzZtmk', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', + id='grafana-piechart-panel', + name='Pie Chart', + version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + '', + '') + ) + .addPanels([ + u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + RgwDetailsPanel( + {}, + '$rgw_servers GET/PUT Latencies', + '', + 's', + 'short', + 'sum by (instance_id) 
(rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'GET {{ceph_daemon}}', + 'PUT {{ceph_daemon}}', + 0, + 1, + 6, + 8 + ), + RgwDetailsPanel( + {}, + 'Bandwidth by HTTP Operation', + '', + 'bytes', + 'short', + 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'GETs {{ceph_daemon}}', + 'PUTs {{ceph_daemon}}', + 6, + 1, + 7, + 8 + ), + RgwDetailsPanel( + { + GETs: '#7eb26d', + Other: '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + 'Requests Failed': '#bf1b00', + }, + 'HTTP Request Breakdown', + '', + 'short', + 'short', + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'Requests Failed {{ceph_daemon}}', + 'GETs {{ceph_daemon}}', + 13, + 1, + 7, + 8 + ) + .addTargets( + [ + u.addTargetSchema( + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'PUTs {{ceph_daemon}}' + ), + u.addTargetSchema( + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Other {{ceph_daemon}}' + ), + ] + ), + u.addPieChartSchema( + { + GETs: '#7eb26d', + 'Other (HEAD,POST,DELETE)': '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + Failures: '#bf1b00', + }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current' + ) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Failures {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'GETs {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'PUTs {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Other (DELETE,LIST) {{ceph_daemon}}' + )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/utils.libsonnet b/monitoring/ceph-mixin/dashboards/utils.libsonnet new file mode 100644 index 0000000000000..f31c0ffe57853 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/utils.libsonnet @@ -0,0 +1,172 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +{ + dashboardSchema(title, + description, + uid, + time_from, + refresh, + schemaVersion, + tags, + timezone, + timepicker):: + g.dashboard.new(title=title, + 
description=description, + uid=uid, + time_from=time_from, + refresh=refresh, + schemaVersion=schemaVersion, + tags=tags, + timezone=timezone, + timepicker=timepicker), + + graphPanelSchema(aliasColors, + title, + description, + nullPointMode, + stack, + formatY1, + formatY2, + labelY1, + labelY2, + min, + fill, + datasource, + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false):: + g.graphPanel.new(aliasColors=aliasColors, + title=title, + description=description, + nullPointMode=nullPointMode, + stack=stack, + formatY1=formatY1, + formatY2=formatY2, + labelY1=labelY1, + labelY2=labelY2, + min=min, + fill=fill, + datasource=datasource, + legend_alignAsTable=legend_alignAsTable, + legend_avg=legend_avg, + legend_min=legend_min, + legend_max=legend_max, + legend_current=legend_current, + legend_values=legend_values), + + + addTargetSchema(expr, intervalFactor, format, legendFormat):: + g.prometheus.target(expr=expr, + intervalFactor=intervalFactor, + format=format, + legendFormat=legendFormat), + + addTemplateSchema(name, + datasource, + query, + refresh, + includeAll, + sort, + label, + regex):: + g.template.new(name=name, + datasource=datasource, + query=query, + refresh=refresh, + includeAll=includeAll, + sort=sort, + label=label, + regex=regex), + + addAnnotationSchema(builtIn, + datasource, + enable, + hide, + iconColor, + name, + type):: + g.annotation.datasource(builtIn=builtIn, + datasource=datasource, + enable=enable, + hide=hide, + iconColor=iconColor, + name=name, + type=type), + + addRowSchema(collapse, showTitle, title):: + g.row.new(collapse=collapse, showTitle=showTitle, title=title), + + addSingleStatSchema(colors, + datasource, + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparklineShow, + thresholds):: + g.singlestat.new(colors=colors, + datasource=datasource, + format=format, + title=title, + description=description, + valueName=valueName, + colorValue=colorValue, + gaugeMaxValue=gaugeMaxValue, + gaugeShow=gaugeShow, + sparklineShow=sparklineShow, + thresholds=thresholds), + + addPieChartSchema(aliasColors, + datasource, + description, + legendType, + pieType, + title, + valueName):: + g.pieChartPanel.new(aliasColors=aliasColors, + datasource=datasource, + description=description, + legendType=legendType, + pieType=pieType, + title=title, + valueName=valueName), + + addTableSchema(datasource, description, sort, styles, title, transform):: + g.tablePanel.new(datasource=datasource, + description=description, + sort=sort, + styles=styles, + title=title, + transform=transform), + + addStyle(alias, + colorMode, + colors, + dateFormat, + decimals, + mappingType, + pattern, + thresholds, + type, + unit, + valueMaps):: + { + alias: alias, + colorMode: colorMode, + colors: colors, + dateFormat: dateFormat, + decimals: decimals, + mappingType: mappingType, + pattern: pattern, + thresholds: thresholds, + type: type, + unit: unit, + valueMaps: valueMaps, + }, +} diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json similarity index 100% rename from monitoring/grafana/dashboards/ceph-cluster.json rename to monitoring/ceph-mixin/dashboards_out/ceph-cluster.json diff --git a/monitoring/grafana/dashboards/cephfs-overview.json b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json similarity index 99% rename from monitoring/grafana/dashboards/cephfs-overview.json rename to 
monitoring/ceph-mixin/dashboards_out/cephfs-overview.json index 91a37f0807be3..5c0c27329d69a 100644 --- a/monitoring/grafana/dashboards/cephfs-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -161,6 +162,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, diff --git a/monitoring/grafana/dashboards/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json similarity index 99% rename from monitoring/grafana/dashboards/host-details.json rename to monitoring/ceph-mixin/dashboards_out/host-details.json index 72014860e0487..7b3c1df152efa 100644 --- a/monitoring/grafana/dashboards/host-details.json +++ b/monitoring/ceph-mixin/dashboards_out/host-details.json @@ -157,6 +157,7 @@ "datasource": "$datasource", "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -249,6 +250,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -363,6 +365,7 @@ "datasource": "$datasource", "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -460,6 +463,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 5, "w": 3, @@ -639,6 +643,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 5, "w": 3, @@ -755,6 +760,7 @@ "datasource": "$datasource", "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -852,6 +858,7 @@ "datasource": "$datasource", "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -949,6 +956,7 @@ "datasource": "$datasource", "description": "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with it's corresponding OSD id", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -1034,6 +1042,7 @@ "datasource": "$datasource", "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json similarity index 93% rename from monitoring/grafana/dashboards/hosts-overview.json rename to monitoring/ceph-mixin/dashboards_out/hosts-overview.json index 758d278cbc6ed..462ddf37bda44 100644 --- a/monitoring/grafana/dashboards/hosts-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json @@ -514,7 +514,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum (\n\t(\n\t\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n\t(\n\t\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n\t)\n", + "expr": "sum (\n (\n irate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n )\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -542,6 +542,7 @@ "datasource": "$datasource", "description": "Show the top 10 busiest hosts by cpu", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -627,6 +628,7 @@ "datasource": "$datasource", "description": "Top 10 hosts by network load", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -661,7 +663,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, (sum by(instance) (\n(\n\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "expr": "topk(10, (sum by(instance) (\n(\n 
irate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", diff --git a/monitoring/grafana/dashboards/osd-device-details.json b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json similarity index 99% rename from monitoring/grafana/dashboards/osd-device-details.json rename to monitoring/ceph-mixin/dashboards_out/osd-device-details.json index 3c62d179f295e..3b45dc967a53a 100644 --- a/monitoring/grafana/dashboards/osd-device-details.json +++ b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -161,6 +162,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -258,6 +260,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -374,6 +377,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -471,6 +475,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -568,6 +573,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -665,6 +671,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, diff --git a/monitoring/grafana/dashboards/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json similarity index 99% rename from monitoring/grafana/dashboards/osds-overview.json rename to monitoring/ceph-mixin/dashboards_out/osds-overview.json index a3c93b53d1cb8..dc05689ecb4b9 100644 --- a/monitoring/grafana/dashboards/osds-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -59,6 +59,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -161,6 +162,7 @@ "y": 0 }, "id": 3, + "links": [ ], "sort": { "col": 2, "desc": true @@ -243,6 +245,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -345,6 +348,7 @@ "y": 0 }, "id": 5, + "links": [ ], "sort": { "col": 2, "desc": true @@ -582,6 +586,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -767,6 +772,7 @@ "datasource": "$datasource", "description": "Show the read/write workload profile overtime", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 24, diff --git a/monitoring/grafana/dashboards/pool-detail.json b/monitoring/ceph-mixin/dashboards_out/pool-detail.json similarity index 99% rename from monitoring/grafana/dashboards/pool-detail.json rename to monitoring/ceph-mixin/dashboards_out/pool-detail.json index e64cc3d82b69b..9a8518e151c61 100644 --- 
a/monitoring/grafana/dashboards/pool-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-detail.json @@ -217,6 +217,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -305,6 +306,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -405,6 +407,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -505,6 +508,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, diff --git a/monitoring/grafana/dashboards/pool-overview.json b/monitoring/ceph-mixin/dashboards_out/pool-overview.json similarity index 99% rename from monitoring/grafana/dashboards/pool-overview.json rename to monitoring/ceph-mixin/dashboards_out/pool-overview.json index 50145c8ab98cc..d70d4c7ae02a8 100644 --- a/monitoring/grafana/dashboards/pool-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-overview.json @@ -690,6 +690,7 @@ "y": 3 }, "id": 10, + "links": [ ], "sort": { "col": 5, "desc": true @@ -1147,6 +1148,7 @@ "datasource": "$datasource", "description": "This chart shows the sum of read and write IOPS from all clients by pool", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, @@ -1239,6 +1241,7 @@ "datasource": "$datasource", "description": "The chart shows the sum of read and write bytes from all clients, by pool", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, @@ -1324,6 +1327,7 @@ "datasource": "$datasource", "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 24, diff --git a/monitoring/grafana/dashboards/radosgw-detail.json b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-detail.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-detail.json index 53486475cbb51..4d68906f2ba07 100644 --- a/monitoring/grafana/dashboards/radosgw-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -70,6 +70,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 6, @@ -162,6 +163,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 7, @@ -260,6 +262,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 7, diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-overview.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 7fe94138b1356..7f9375290fc0c 100644 --- a/monitoring/grafana/dashboards/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -156,6 +157,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 7, @@ -241,6 +243,7 @@ "datasource": "$datasource", "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 6, @@ -326,6 
+329,7 @@ "datasource": "$datasource", "description": "Total bytes transferred in/out of all radosgw instances within the cluster", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 8, @@ -418,6 +422,7 @@ "datasource": "$datasource", "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 7, @@ -503,6 +508,7 @@ "datasource": "$datasource", "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 6, @@ -607,6 +613,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 5, @@ -724,6 +731,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 5, @@ -862,6 +870,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 5, @@ -968,6 +977,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 6, diff --git a/monitoring/grafana/dashboards/radosgw-sync-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-sync-overview.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json index 442da57590652..232242acc5860 100644 --- a/monitoring/grafana/dashboards/radosgw-sync-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json @@ -45,6 +45,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -130,6 +131,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -215,6 +217,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -300,6 +303,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, diff --git a/monitoring/grafana/dashboards/rbd-details.json b/monitoring/ceph-mixin/dashboards_out/rbd-details.json similarity index 99% rename from monitoring/grafana/dashboards/rbd-details.json rename to monitoring/ceph-mixin/dashboards_out/rbd-details.json index ea7f79aa27ff2..7a9e1b56b8fc9 100644 --- a/monitoring/grafana/dashboards/rbd-details.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-details.json @@ -45,6 +45,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, @@ -137,6 +138,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, @@ -229,6 +231,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, diff --git a/monitoring/grafana/dashboards/rbd-overview.json b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json similarity index 99% rename from monitoring/grafana/dashboards/rbd-overview.json rename to monitoring/ceph-mixin/dashboards_out/rbd-overview.json index 5f0ade741e971..71c32ce71fb86 100644 --- a/monitoring/grafana/dashboards/rbd-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -57,6 +57,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -149,6 +150,7 @@ "datasource": "$datasource", 
"description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -241,6 +243,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -336,6 +339,7 @@ "y": 7 }, "id": 5, + "links": [ ], "sort": { "col": 3, "desc": true @@ -436,6 +440,7 @@ "y": 7 }, "id": 6, + "links": [ ], "sort": { "col": 3, "desc": true @@ -536,6 +541,7 @@ "y": 7 }, "id": 7, + "links": [ ], "sort": { "col": 3, "desc": true diff --git a/monitoring/ceph-mixin/jsonnetfile.json b/monitoring/ceph-mixin/jsonnetfile.json new file mode 100644 index 0000000000000..93f3316ec3830 --- /dev/null +++ b/monitoring/ceph-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/monitoring/ceph-mixin/jsonnetfile.lock.json b/monitoring/ceph-mixin/jsonnetfile.lock.json new file mode 100644 index 0000000000000..0430b39fc3674 --- /dev/null +++ b/monitoring/ceph-mixin/jsonnetfile.lock.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "3626fc4dc2326931c530861ac5bebe39444f6cbf", + "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w=" + } + ], + "legacyImports": false +} diff --git a/monitoring/ceph-mixin/lint-jsonnet.sh b/monitoring/ceph-mixin/lint-jsonnet.sh new file mode 100755 index 0000000000000..6f77162698246 --- /dev/null +++ b/monitoring/ceph-mixin/lint-jsonnet.sh @@ -0,0 +1,5 @@ +#!/bin/sh -e + +JSONNETS_FILES=$(find . -name 'vendor' -prune -o \ + -name '*.jsonnet' -print -o -name '*.libsonnet' -print) +jsonnetfmt "$@" ${JSONNETS_FILES} diff --git a/monitoring/ceph-mixin/mixin.libsonnet b/monitoring/ceph-mixin/mixin.libsonnet new file mode 100644 index 0000000000000..c89b2a916a891 --- /dev/null +++ b/monitoring/ceph-mixin/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'config.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'alerts.libsonnet') diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yaml similarity index 99% rename from monitoring/prometheus/alerts/ceph_default_alerts.yml rename to monitoring/ceph-mixin/prometheus_alerts.yaml index d9e6e35637f92..fc38678f99dd5 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yaml @@ -887,4 +887,4 @@ groups: description: | One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes don't go unseen. To acknowledge a crash, use the - 'ceph crash archive ' command. \ No newline at end of file + 'ceph crash archive ' command. 
diff --git a/monitoring/prometheus/tests/requirements.txt b/monitoring/ceph-mixin/requirements-alerts.txt
similarity index 100%
rename from monitoring/prometheus/tests/requirements.txt
rename to monitoring/ceph-mixin/requirements-alerts.txt
diff --git a/monitoring/grafana/dashboards/requirements-grafonnet.txt b/monitoring/ceph-mixin/requirements-grafonnet.txt
similarity index 100%
rename from monitoring/grafana/dashboards/requirements-grafonnet.txt
rename to monitoring/ceph-mixin/requirements-grafonnet.txt
diff --git a/monitoring/grafana/dashboards/requirements-lint.txt b/monitoring/ceph-mixin/requirements-lint.txt
similarity index 100%
rename from monitoring/grafana/dashboards/requirements-lint.txt
rename to monitoring/ceph-mixin/requirements-lint.txt
diff --git a/monitoring/ceph-mixin/test-jsonnet.sh b/monitoring/ceph-mixin/test-jsonnet.sh
new file mode 100755
index 0000000000000..fef0443a9ade5
--- /dev/null
+++ b/monitoring/ceph-mixin/test-jsonnet.sh
@@ -0,0 +1,31 @@
+#!/bin/sh -e
+
+TEMPDIR=$(mktemp -d)
+BASEDIR=$(dirname "$0")
+
+jsonnet -J vendor -m ${TEMPDIR} $BASEDIR/dashboards.jsonnet
+
+truncate -s 0 ${TEMPDIR}/json_difference.log
+for file in ${BASEDIR}/dashboards_out/*.json
+do
+  file_name="$(basename $file)"
+  for generated_file in ${TEMPDIR}/*.json
+  do
+    generated_file_name="$(basename $generated_file)"
+    if [ "$file_name" = "$generated_file_name" ]; then
+      jsondiff --indent 2 "${generated_file}" "${file}" \
+          | tee -a ${TEMPDIR}/json_difference.log
+    fi
+  done
+done
+
+err=0
+if [ $(wc -l < ${TEMPDIR}/json_difference.log) -eq 0 ]
+then
+  rm -rf ${TEMPDIR}
+  echo "Congratulations! Grafonnet Check Passed"
+else
+  rm -rf ${TEMPDIR}
+  echo "Grafonnet Check Failed, failed comparing generated file with existing"
+  exit 1
+fi
diff --git a/monitoring/prometheus/tests/README.md b/monitoring/ceph-mixin/tests_alerts/README.md
similarity index 100%
rename from monitoring/prometheus/tests/README.md
rename to monitoring/ceph-mixin/tests_alerts/README.md
diff --git a/monitoring/grafana/dashboards/tests/features/__init__.py b/monitoring/ceph-mixin/tests_alerts/__init__.py
similarity index 100%
rename from monitoring/grafana/dashboards/tests/features/__init__.py
rename to monitoring/ceph-mixin/tests_alerts/__init__.py
diff --git a/monitoring/ceph-mixin/tests_alerts/settings.py b/monitoring/ceph-mixin/tests_alerts/settings.py
new file mode 100644
index 0000000000000..9dc639fd30cb2
--- /dev/null
+++ b/monitoring/ceph-mixin/tests_alerts/settings.py
@@ -0,0 +1,11 @@
+import os
+
+ALERTS_FILE = '../prometheus_alerts.yaml'
+UNIT_TESTS_FILE = 'test_alerts.yml'
+MIB_FILE = '../../snmp/CEPH-MIB.txt'
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+ALERTS_FILE = os.path.join(current_dir, ALERTS_FILE)
+UNIT_TESTS_FILE = os.path.join(current_dir, UNIT_TESTS_FILE)
+MIB_FILE = os.path.join(current_dir, MIB_FILE)
diff --git a/monitoring/prometheus/tests/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
similarity index 99%
rename from monitoring/prometheus/tests/test_alerts.yml
rename to monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index cd980deb39b3c..14dfb942b62f3 100644
--- a/monitoring/prometheus/tests/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -1,5 +1,5 @@
 rule_files:
-  - ../alerts/ceph_default_alerts.yml
+  - ../prometheus_alerts.yaml
 evaluation_interval: 5m
 tests:
  # health error
@@ -1990,4 +1990,4 @@ tests:
           description: |
             One or more daemons have crashed recently, and need to be acknowledged. 
This notification ensures that software crashes don't go unseen. To acknowledge a crash, use the - 'ceph crash archive ' command. \ No newline at end of file + 'ceph crash archive ' command. diff --git a/monitoring/prometheus/tests/test_syntax.py b/monitoring/ceph-mixin/tests_alerts/test_syntax.py similarity index 100% rename from monitoring/prometheus/tests/test_syntax.py rename to monitoring/ceph-mixin/tests_alerts/test_syntax.py diff --git a/monitoring/prometheus/tests/test_unittests.py b/monitoring/ceph-mixin/tests_alerts/test_unittests.py similarity index 100% rename from monitoring/prometheus/tests/test_unittests.py rename to monitoring/ceph-mixin/tests_alerts/test_unittests.py diff --git a/monitoring/prometheus/tests/utils.py b/monitoring/ceph-mixin/tests_alerts/utils.py similarity index 100% rename from monitoring/prometheus/tests/utils.py rename to monitoring/ceph-mixin/tests_alerts/utils.py diff --git a/monitoring/prometheus/tests/validate_rules.py b/monitoring/ceph-mixin/tests_alerts/validate_rules.py similarity index 98% rename from monitoring/prometheus/tests/validate_rules.py rename to monitoring/ceph-mixin/tests_alerts/validate_rules.py index 428779a47de87..c24ce5c59d553 100755 --- a/monitoring/prometheus/tests/validate_rules.py +++ b/monitoring/ceph-mixin/tests_alerts/validate_rules.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 -u +#!/usr/bin/env python3 # # Check the Prometheus rules for format, and integration # with the unit tests. This script has the following exit @@ -27,10 +27,9 @@ import urllib.error from urllib.parse import urlparse +from settings import ALERTS_FILE, MIB_FILE, UNIT_TESTS_FILE + DOCLINK_NAME = 'documentation' -DEFAULT_RULES_FILENAME = '../alerts/ceph_default_alerts.yml' -DEFAULT_TEST_FILENAME = 'test_alerts.yml' -MIB_FILE = '../../snmp/CEPH-MIB.txt' def isascii(s: str) -> bool: @@ -463,8 +462,8 @@ def report(self) -> None: class RuleChecker: def __init__(self, rules_filename: str = None, test_filename: str = None): - self.rules_filename = rules_filename or DEFAULT_RULES_FILENAME - self.test_filename = test_filename or DEFAULT_TEST_FILENAME + self.rules_filename = rules_filename or ALERTS_FILE + self.test_filename = test_filename or UNIT_TESTS_FILE self.rule_file: Optional[RuleFile] = None self.unit_tests: Optional[UnitTests] = None self.rule_file_problems: bool = False diff --git a/monitoring/grafana/dashboards/tests/__init__.py b/monitoring/ceph-mixin/tests_dashboards/__init__.py similarity index 100% rename from monitoring/grafana/dashboards/tests/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/__init__.py diff --git a/monitoring/prometheus/tests/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/__init__.py similarity index 100% rename from monitoring/prometheus/tests/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/features/__init__.py diff --git a/monitoring/grafana/dashboards/tests/features/ceph-cluster.feature b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/ceph-cluster.feature rename to monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature diff --git a/monitoring/grafana/dashboards/tests/features/environment.py b/monitoring/ceph-mixin/tests_dashboards/features/environment.py similarity index 97% rename from monitoring/grafana/dashboards/tests/features/environment.py rename to monitoring/ceph-mixin/tests_dashboards/features/environment.py index 8509b9d97e8e2..5dc76a09e41d0 100644 --- 
a/monitoring/grafana/dashboards/tests/features/environment.py +++ b/monitoring/ceph-mixin/tests_dashboards/features/environment.py @@ -5,8 +5,8 @@ from behave import given, then, when from prettytable import PrettyTable -from tests import PromqlTest -from tests.util import get_dashboards_data, resolve_time_and_unit +from tests_dashboards import PromqlTest +from tests_dashboards.util import get_dashboards_data, resolve_time_and_unit class GlobalContext: diff --git a/monitoring/grafana/dashboards/tests/features/host-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/host-details.feature rename to monitoring/ceph-mixin/tests_dashboards/features/host-details.feature diff --git a/monitoring/grafana/dashboards/tests/features/hosts_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/hosts_overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/osd-device-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/osd-device-details.feature rename to monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature diff --git a/monitoring/grafana/dashboards/tests/features/osds-overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/osds-overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/radosgw-detail.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/radosgw-detail.feature rename to monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/radosgw_overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/self.feature b/monitoring/ceph-mixin/tests_dashboards/features/self.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/self.feature rename to monitoring/ceph-mixin/tests_dashboards/features/self.feature diff --git a/monitoring/grafana/dashboards/tests/features/steps/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py similarity index 100% rename from monitoring/grafana/dashboards/tests/features/steps/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py diff --git a/monitoring/grafana/dashboards/tests/requirements.txt b/monitoring/ceph-mixin/tests_dashboards/requirements.txt similarity index 100% rename from monitoring/grafana/dashboards/tests/requirements.txt rename to monitoring/ceph-mixin/tests_dashboards/requirements.txt diff --git a/monitoring/grafana/dashboards/tests/util.py b/monitoring/ceph-mixin/tests_dashboards/util.py similarity index 97% rename from 
rename to monitoring/ceph-mixin/tests_dashboards/util.py
index 8bc097875d77c..4310eb207f058 100644
--- a/monitoring/grafana/dashboards/tests/util.py
+++ b/monitoring/ceph-mixin/tests_dashboards/util.py
@@ -23,7 +23,8 @@ def resolve_time_and_unit(time: str) -> Union[Tuple[int, str], Tuple[None, None]
 
 def get_dashboards_data() -> Dict[str, Any]:
     data: Dict[str, Any] = {'queries': {}, 'variables': {}, 'stats': {}}
-    for file in Path(__file__).parent.parent.glob('*.json'):
+    for file in Path(__file__).parent.parent \
+            .joinpath('dashboards_out').glob('*.json'):
         with open(file, 'r') as f:
             dashboard_data = json.load(f)
             data['stats'][str(file)] = {'total': 0, 'tested': 0}
diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini
new file mode 100644
index 0000000000000..e15e17084f7ce
--- /dev/null
+++ b/monitoring/ceph-mixin/tox.ini
@@ -0,0 +1,69 @@
+[tox]
+envlist = lint,jsonnet-{check,lint,fix},promql-query-{test,lint},alerts-check
+skipsdist = true
+
+[testenv:jsonnet-bundler-{install,update}]
+whitelist_externals =
+    jb
+description =
+    install: Install the jsonnet dependencies
+    update: Update the jsonnet dependencies
+commands =
+    install: jb install
+    update: jb update
+
+[testenv:jsonnet-{check,fix,lint}]
+basepython = python3
+whitelist_externals =
+    find
+    jb
+    jsonnet
+    jsonnetfmt
+    sh
+description =
+    check: Ensure that auto-generated files match the current version
+    fix: Update generated files from jsonnet files with the latest changes
+    lint: Check that jsonnet files are linted (without any update)
+deps =
+    -rrequirements-grafonnet.txt
+depends = jsonnet-bundler-install
+commands =
+    check: sh test-jsonnet.sh
+    lint: ./lint-jsonnet.sh --test
+    fix: jsonnet -J vendor -m dashboards_out dashboards.jsonnet
+
+[testenv:lint]
+description =
+    Run python linters
+deps =
+    -rrequirements-lint.txt
+setenv =
+commands =
+    pylint --rcfile=.pylintrc tests_dashboards
+    mypy tests_dashboards
+    isort tests_dashboards
+
+[testenv:promql-query-test]
+description =
+    lint: Run promtool check on grafana queries
+    test: Run promtool unit testing on grafana queries.
+deps =
+    -rrequirements-lint.txt
+depends = grafonnet-check
+setenv =
+whitelist_externals =
+    promtool
+commands =
+    behave tests_dashboards/features
+
+[testenv:alerts-{check,lint}]
+deps =
+    -rrequirements-alerts.txt
+    pytest
+depends = grafonnet-check
+whitelist_externals =
+    promtool
+commands =
+    lint: promtool check rules prometheus_alerts.yaml
+    check: pytest -rA tests_alerts/test_syntax.py tests_alerts/test_unittests.py
+    check: python3 ./tests_alerts/validate_rules.py
diff --git a/monitoring/grafana/README.md b/monitoring/grafana/README.md
deleted file mode 100644
index b4bf4ec3273d0..0000000000000
--- a/monitoring/grafana/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## Grafana dashboards for Ceph
-
-Here you can find a collection of [Grafana](https://grafana.com/grafana)
-dashboards for Ceph Monitoring. These dashboards are based on metrics collected
-from [prometheus](https://prometheus.io/) scraping the [prometheus mgr
-plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the
-[node_exporter](https://github.com/prometheus/node_exporter).
- -### Other requirements - -- Luminous 12.2.5 or newer -- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed -- node_exporter 0.15.x and 0.16.x are supported (host details and hosts -overview dashboards) diff --git a/monitoring/grafana/build/Makefile b/monitoring/grafana/build/Makefile index 6b2a0a9f91902..ace8dec3a2e1c 100755 --- a/monitoring/grafana/build/Makefile +++ b/monitoring/grafana/build/Makefile @@ -2,7 +2,7 @@ GRAFANA_VERSION ?= 6.7.4-1 PIECHART_VERSION ?= "1.6.1" STATUS_PANEL_VERSION ?= "1.0.9" -DASHBOARD_DIR := "../dashboards" +DASHBOARD_DIR := "../../ceph-mixin/dashboards_out" DASHBOARD_PROVISIONING := "ceph-dashboard.yml" IMAGE := "docker.io/centos:8" PKGMGR := "dnf" diff --git a/monitoring/grafana/dashboards/.pylintrc b/monitoring/grafana/dashboards/.pylintrc deleted file mode 120000 index aa04b020cb4c1..0000000000000 --- a/monitoring/grafana/dashboards/.pylintrc +++ /dev/null @@ -1 +0,0 @@ -../../../src/pybind/mgr/dashboard/.pylintrc \ No newline at end of file diff --git a/monitoring/grafana/dashboards/CMakeLists.txt b/monitoring/grafana/dashboards/CMakeLists.txt deleted file mode 100644 index c344958199087..0000000000000 --- a/monitoring/grafana/dashboards/CMakeLists.txt +++ /dev/null @@ -1,41 +0,0 @@ -set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard" - CACHE PATH "Location for grafana dashboards") -file(GLOB CEPH_GRAFANA_DASHBOARDS "*.json") -install(FILES - ${CEPH_GRAFANA_DASHBOARDS} - DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR}) - -set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) -if(NOT CEPH_BUILD_VIRTUALENV) - set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) -endif() - -if(WITH_GRAFANA) - if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm|ARM") - include(AddCephTest) - add_tox_test(grafana-check TOX_ENVS grafonnet-check) - add_tox_test(promql-query-test TOX_ENVS promql-query-test) - add_tox_test(grafana-lint TOX_ENVS lint) - set(ver 0.1.0) - set(name grafonnet-lib) - include(ExternalProject) - ExternalProject_Add(${name} - URL https://github.com/grafana/${name}/archive/v${ver}/${name}-${ver}.tar.gz - URL_MD5 0798752ed40864fa8b3db40a3c970642 - BUILD_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - LOG_DOWNLOAD ON - LOG_MERGED_STDOUTERR ON - LOG_OUTPUT_ON_FAILURE ON - EXCLUDE_FROM_ALL ON) - add_dependencies(tests - ${name}) - ExternalProject_Get_Property(${name} SOURCE_DIR) - set_property( - TEST run-tox-grafana-check run-tox-promql-query-test run-tox-grafana-lint - APPEND - PROPERTY ENVIRONMENT - GRAFONNET_PATH=${SOURCE_DIR}/grafonnet) - endif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm|ARM") -endif() diff --git a/monitoring/grafana/dashboards/README b/monitoring/grafana/dashboards/README deleted file mode 100644 index 3803cd7a00422..0000000000000 --- a/monitoring/grafana/dashboards/README +++ /dev/null @@ -1,28 +0,0 @@ -Context -These dashboards should be enough to get started on the integration. It's not a complete set, so more will be added in the next week. - -Bare in mind that the osd device details dashboard needs node_exporter active - all the other dashboards pick data out of ceph-mgr based metrics. - - -The cephfs dashboard only has 2 panels currently. The counter available are -a little light at the moment. 
Patrick/Venky have been addressing this with -https://bugzilla.redhat.com/show_bug.cgi?id=1618523 -cephfs-overview.json - -Host Information -host-details.json combines generic server metrics that show cpu/memory/network stats (including network errors/drops), -with disk level stats for OSD hosts. OSD charts show the physical device name together with it's corresponding osd id for correlation. - -Ceph Pools -two dashboards. Overview gives the high level combined view, pool-detail needs a pool_name variable passed to it (currently uses a templating var which is visible) -pool-overview.json -pool-detail.json - -OSD Device Details. This dashboard needs some further work. It currently shows -OSD level stats with physical device stats but leaves out some of the counters -that cephmetrics provides for trouble shooting. -osd-device-details.json - -Object gateway dashboards, again split into overview and detail. The detail dashboard needs the relevant ceph-deamon name for the rgw instance. -radosgw-overview.json -radosgw-detail.json diff --git a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet deleted file mode 100644 index 791e4568db178..0000000000000 --- a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet +++ /dev/null @@ -1,1509 +0,0 @@ -local g = import 'grafana.libsonnet'; - -local dashboardSchema(title, description, uid, time_from, refresh, schemaVersion, tags, timezone, timepicker) = - g.dashboard.new(title=title, description=description, uid=uid, time_from=time_from, refresh=refresh, schemaVersion=schemaVersion, tags=tags, timezone=timezone, timepicker=timepicker); - -local graphPanelSchema(aliasColors, title, description, nullPointMode, stack, formatY1, formatY2, labelY1, labelY2, min, fill, datasource, legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) = - g.graphPanel.new(aliasColors=aliasColors, title=title, description=description, nullPointMode=nullPointMode, stack=stack, formatY1=formatY1, formatY2=formatY2, labelY1=labelY1, labelY2=labelY2, min=min, fill=fill, datasource=datasource, legend_alignAsTable=legend_alignAsTable, legend_avg=legend_avg, legend_min=legend_min, legend_max=legend_max, legend_current=legend_current, legend_values=legend_values); - -local addTargetSchema(expr, intervalFactor, format, legendFormat) = - g.prometheus.target(expr=expr, intervalFactor=intervalFactor, format=format, legendFormat=legendFormat); - -local addTemplateSchema(name, datasource, query, refresh, includeAll, sort, label, regex) = - g.template.new(name=name, datasource=datasource, query=query, refresh=refresh, includeAll=includeAll, sort=sort, label=label, regex=regex); - -local addAnnotationSchema(builtIn, datasource, enable, hide, iconColor, name, type) = - g.annotation.datasource(builtIn=builtIn, datasource=datasource, enable=enable, hide=hide, iconColor=iconColor, name=name, type=type); - -local addRowSchema(collapse, showTitle, title) = - g.row.new(collapse=collapse, showTitle=showTitle, title=title); - -local addSingelStatSchema(colors, datasource, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparklineShow, thresholds) = - g.singlestat.new(colors=colors, datasource=datasource, format=format, title=title, description=description, valueName=valueName, colorValue=colorValue, gaugeMaxValue=gaugeMaxValue, gaugeShow=gaugeShow, sparklineShow=sparklineShow, thresholds=thresholds); - -local 
addPieChartSchema(aliasColors, datasource, description, legendType, pieType, title, valueName) = - g.pieChartPanel.new(aliasColors=aliasColors, datasource=datasource, description=description, legendType=legendType, pieType=pieType, title=title, valueName=valueName); - -local addTableSchema(datasource, description, sort, styles, title, transform) = - g.tablePanel.new(datasource=datasource, description=description, sort=sort, styles=styles, title=title, transform=transform); - -local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, pattern, thresholds, type, unit, valueMaps) = - {'alias': alias, 'colorMode': colorMode, 'colors':colors, 'dateFormat':dateFormat, 'decimals':decimals, 'mappingType':mappingType, 'pattern':pattern, 'thresholds':thresholds, 'type':type, 'unit':unit, 'valueMaps':valueMaps}; - -{ - "hosts-overview.json": - local HostsOverviewSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '') - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = - graphPanelSchema({}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Host Overview', '', 'y0KGL0iZz', 'now-1h', '10s', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - addTemplateSchema('osd_hosts', '$datasource', 'label_values(ceph_disk_occupation, exported_instance)', 1, true, 1, null, '([^.]*).*') - ) - .addTemplate( - addTemplateSchema('mon_hosts', '$datasource', 'label_values(ceph_mon_metadata, ceph_daemon)', 1, true, 1, null, 'mon.(.*)') - ) - .addTemplate( - addTemplateSchema('mds_hosts', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, null, 'mds.(.*)') - ) - .addTemplate( - addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') - ) - .addPanels([ - HostsOverviewSingleStatPanel( - 'none', - 'OSD Hosts', - '', - 'current', - 'count(sum by (hostname) (ceph_osd_metadata))', - 'time_series', - 0, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG CPU Busy', - 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', - 'current', - 'avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]))\n )\n )', - 'time_series', - 4, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG RAM 
Utilization', - 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', - 'current', - 'avg (((node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})- (\n (node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +\n (node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})\n )) /\n (node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} ))', - 'time_series', - 8, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'none', - 'Physical IOPS', - 'IOPS Load at the device as reported by the OS on all OSD hosts', - 'current', - 'sum ((irate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[5m])))', - 'time_series', - 12, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percent', - 'AVG Disk Utilization', - 'Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)', - 'current', - 'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)', - 'time_series', - 16, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'bytes', - 'Network Load', - 'Total send/receive network load across all hosts in the ceph cluster', - 'current', - 'sum (\n\t(\n\t\tirate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or\n\t\tirate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")\n) +\nsum (\n\t(\n\t\tirate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or\n\t\tirate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")\n\t)\n' - , 'time_series', - 20, 0, 4, 5 - ), - HostsOverviewGraphPanel( - 'CPU Busy - Top 10 Hosts', - 'Show the top 10 busiest hosts by cpu', - 'percent', - 'topk(10,100 * ( 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]))\n )\n )\n)', - '{{instance}}', - 0, 5, 12, 9 - ), - HostsOverviewGraphPanel( - 'Network Load - Top 10 Hosts', - 'Top 10 hosts by network load', - 'Bps', - 'topk(10, (sum by(instance) (\n(\n\tirate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or\n\tirate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])\n) +\n(\n\tirate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or\n\tirate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])\n) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))\n))\n' - , - '{{instance}}', - 12, 5, 12, 9 - ), - ]) -} -{ - "host-details.json": - local HostDetailsSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '') - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local HostDetailsGraphPanel(alias, title, description, nullPointMode, formatY1, labelY1, expr, legendFormat, x, y, w, h) = - graphPanelSchema(alias, title, description, nullPointMode, false, formatY1, 'short', labelY1, null, null, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Host Details', '', 'rtOg0AiWz', 'now-1h', '10s', 16, ['overview'], '', 
{refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - addTemplateSchema('ceph_hosts', '$datasource', 'label_values(node_scrape_collector_success, instance) ', 1, false, 3, 'Hostname', '([^.:]*).*') - ) - .addPanels([ - addRowSchema(false, true, '$ceph_hosts System Overview') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, - HostDetailsSingleStatPanel( - 'none', - 'OSDs', - '', - 'current', - 'count(sum by (ceph_daemon) (ceph_osd_metadata{hostname=\'$ceph_hosts\'}))', - 'time_series', - 0, 1, 3, 5 - ), - HostDetailsGraphPanel( - {"interrupt": "#447EBC","steal": "#6D1F62","system": "#890F02","user": "#3F6833","wait": "#C15C17"}, - 'CPU Utilization', - 'Shows the CPU breakdown. When multiple servers are selected, only the first host\'s cpu data is shown', - 'null', - 'percent', - '% Utilization', - 'sum by (mode) (\n irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]))\n) * 100', - '{{mode}}', - 3, 1, 6, 10 - ), - HostDetailsGraphPanel( - {"Available": "#508642","Free": "#508642","Total": "#bf1b00","Used": "#bf1b00","total": "#bf1b00","used": "#0a50a1"}, - 'RAM Usage', - '', - 'null', - 'bytes', - 'RAM used', - 'node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', - 'Free', - 9, 1, 6, 10) - .addTargets( - [ - addTargetSchema( - 'node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', - 1, - 'time_series', - 'total' - ), - addTargetSchema( - '(node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n(node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n(node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) \n', - 1, - 'time_series', - 'buffers/cache' - ), - addTargetSchema( - '(node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})- (\n (node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n (node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or 
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})\n )\n \n', - 1, - 'time_series', - 'used' - )]) - .addSeriesOverride({"alias": "total","color": "#bf1b00","fill": 0,"linewidth": 2,"stack": false} - ), - HostDetailsGraphPanel( - {}, - 'Network Load', - 'Show the network load (rx,tx) across all interfaces (excluding loopback \'lo\')', - 'null', - 'decbytes', - 'Send (-) / Receive (+)', - 'sum by (device) (\n irate(node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', - '{{device}}.rx', - 15, 1, 6, 10 - ) - .addTargets( - [ - addTargetSchema( - 'sum by (device) (\n irate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - 'Network drop rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - 'irate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - '{{device}}.rx', - 21, 1, 3, 5 - ) - .addTargets( - [ - addTargetSchema( - 'irate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - HostDetailsSingleStatPanel( - 'bytes', - 'Raw Capacity', - 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', - 'current', - 'sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"})', - 'time_series', - 0, 6, 3, 5 - ), - HostDetailsGraphPanel( - {}, - 'Network error rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - 'irate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - '{{device}}.rx', - 21, 6, 3, 5 - ) - .addTargets( - [ - addTargetSchema( - 'irate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - addRowSchema(false, true, 'OSD Disk Performance Statistics') + {gridPos: {x: 0, y: 11, w: 24, h: 1}}, - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk IOPS', - 'For any OSD devices on the host, this chart shows the iops per physical device. 
Each device is shown by it\'s name and corresponding OSD id value', - 'connected', - 'ops', - 'Read (-) / Write (+)', - 'label_replace(\n (\n irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', - '{{device}}({{ceph_daemon}}) writes', - 0, 12, 11, 9 - ) - .addTargets( - [ - addTargetSchema( - 'label_replace(\n (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', - 1, - 'time_series', - '{{device}}({{ceph_daemon}}) reads' - )]) - .addSeriesOverride({"alias": "/.*reads/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Throughput by Disk', - 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id', - 'connected', - 'Bps', - 'Read (-) / Write (+)', - 'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}}) write', - 12, 12, 11, 9 - ) - .addTargets( - [ - addTargetSchema( - 'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - 1, - 'time_series', - '{{device}}({{ceph_daemon}}) read' - )]) - .addSeriesOverride({"alias": "/.*read/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk Latency', - 'For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with it\'s corresponding OSD id', - 'null as zero', - 's', - '', - 'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}})', - 0, 21, 11, 9 - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk utilization', - 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', - 'connected', - 'percent', - '%Util', - 'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}})', - 12, 21, 11, 9 - ) - ]) -} -{ - "radosgw-sync-overview.json": - local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = - graphPanelSchema({}, title, '', 'null as zero', true, formatY1, 'short', labelY1, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric, 1, 'time_series', '{{source_zone}}')]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Sync Overview', '', 'rgw-sync-overview', 'now-1h', '15s', 16, ["overview"], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'prometehus', 1, true, 1, '', '') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addPanels([ - RgwSyncOverviewPanel( - 'Replication (throughput) from Source Zone', - 'Bps', - null, - 'ceph_data_sync_from_zone_fetch_bytes_sum', - 0, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Replication (objects) from Source Zone', - 'short', - 'Objects/s', - 'ceph_data_sync_from_zone_fetch_bytes_count', - 8, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Polling Request Latency from Source Zone', - 'ms', - null, - 'ceph_data_sync_from_zone_poll_latency_sum', - 16, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Unsuccessful Object Replications from Source Zone', - 'short', - 'Count/s', - 'ceph_data_sync_from_zone_fetch_errors', - 0, 7, 8, 7 - ) - ]) -} -{ - "radosgw-overview.json": - local RgwOverviewPanel(title, description, formatY1, formatY2, expr1, legendFormat1, x, y, w, h, 
datasource='$datasource', legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) = - graphPanelSchema({}, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, datasource, legend_alignAsTable, legend_avg, legend_min, legend_max, legend_current, legend_values) - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', legendFormat1)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Overview', '', 'WAkugZpiz', 'now-1h', '15s', 16, ['overview'], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') - ) - .addTemplate( - addTemplateSchema('code', '$datasource', 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', 1, true, 1, 'HTTP Code', '') - ) - .addTemplate( - addTemplateSchema('ingress_service', '$datasource', 'label_values(haproxy_server_status, instance)', 1, true, 1, 'Ingress Service', '') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addPanels([ - addRowSchema(false, true, 'RGW Overview - All Gateways') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, - RgwOverviewPanel( - 'Average GET/PUT Latencies', - '', - 's', - 'short', - 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', - 'GET AVG', - 0, 1, 8, 7 - ) - .addTargets( - [ - addTargetSchema( - 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', - 1, - 'time_series', - 'PUT AVG' - )]), - RgwOverviewPanel( - 'Total Requests/sec by RGW Instance', - '', - 'none', - 'short', - 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', - '{{rgw_host}}', - 8, 1, 7, 7 - ), - RgwOverviewPanel( - 'GET Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', - 's', - 'short', - 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 15, 1, 6, 7 - ), - RgwOverviewPanel( - 'Bandwidth Consumed by Type', - 'Total bytes transferred in/out of all radosgw instances within the cluster', - 'bytes', - 'short', - 'sum(rate(ceph_rgw_get_b[30s]))', - 'GETs', - 0, 8, 8, 6 - ) - .addTargets( - [ - addTargetSchema( - 'sum(rate(ceph_rgw_put_b[30s]))', - 1, - 'time_series', - 'PUTs' - )]), - RgwOverviewPanel( - 'Bandwidth by RGW Instance', - 'Total bytes transferred in/out through get/put operations, by radosgw instance', - 'bytes', - 'short', - 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left 
(ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 8, 8, 7, 6 - ), - RgwOverviewPanel( - 'PUT Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', - 's', - 'short', - 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 15, 8, 6, 6 - ), - addRowSchema(false, true, 'RGW Overview - HAProxy Metrics') + {gridPos: {x: 0, y: 12, w: 9, h: 12}}, - RgwOverviewPanel( - 'Total responses by HTTP code', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)', - 'Frontend {{ code }}', - 0, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema( - 'sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', - 1, - 'time_series', - 'Backend {{ code }}' - )]) - .addSeriesOverride([ - { "alias": "/.*Back.*/", - "transform": "negative-Y" }, - { "alias": "/.*1.*/" }, - { "alias": "/.*2.*/" }, - { "alias": "/.*3.*/" }, - { "alias": "/.*4.*/" }, - { "alias": "/.*5.*/" }, - { "alias": "/.*other.*/" } - ]), - RgwOverviewPanel( - 'Total requests / responses', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', - 'Requests', - 5, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'), - addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'), - addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'), - addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'), - addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'), - addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'), - ]) - .addSeriesOverride([ - { - "alias": "/.*Response.*/", - "transform": "negative-Y" - }, - { - "alias": "/.*Backend.*/", - "transform": "negative-Y" - } - ]), - RgwOverviewPanel( - 'Total number of connections', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', - 'Front', - 10, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'), - addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 
'time_series', 'Back errors'), - ]) - .addSeriesOverride([ - { - "alias": "/.*Back.*/", - "transform": "negative-Y" - } - ]), - RgwOverviewPanel( - 'Current total of incoming / outgoing bytes', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 'IN Front', 15, 12, 6, 12, '$datasource', true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'), - addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'), - addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back') - ]) - .addSeriesOverride([ - { - "alias": "/.*OUT.*/", - "transform": "negative-Y" - } - ]) - ]) -} -{ - "radosgw-detail.json": - local RgwDetailsPanel(aliasColors, title, description, formatY1, formatY2, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) = - graphPanelSchema(aliasColors, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Instance Detail', '', 'x5ARzZtmk', 'now-1h', '15s', 16, ['overview'], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') - ) - .addPanels([ - addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, - RgwDetailsPanel( - {}, - '$rgw_servers GET/PUT Latencies', - '', - 's', - 'short', - 'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'GET {{ceph_daemon}}', - 'PUT {{ceph_daemon}}', - 0, 1, 6, 8 - ), - RgwDetailsPanel( - {}, - 'Bandwidth by HTTP Operation', - '', - 'bytes', - 'short', - 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'GETs {{ceph_daemon}}', - 'PUTs {{ceph_daemon}}', - 6, 1, 7, 8 - ), - RgwDetailsPanel( - {"GETs": "#7eb26d","Other": "#447ebc","PUTs": "#eab839","Requests": 
"#3f2b5b","Requests Failed": "#bf1b00"}, - 'HTTP Request Breakdown', - '', - 'short', - 'short', - 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 'Requests Failed {{ceph_daemon}}', - 'GETs {{ceph_daemon}}', - 13, 1, 7, 8 - ) - .addTargets( - [ - addTargetSchema( - 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 1, - 'time_series', - 'PUTs {{ceph_daemon}}' - ), - addTargetSchema( - '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', - 1, - 'time_series', - 'Other {{ceph_daemon}}' - )]), - addPieChartSchema( - {"GETs": "#7eb26d","Other (HEAD,POST,DELETE)": "#447ebc","PUTs": "#eab839","Requests": "#3f2b5b","Failures": "#bf1b00"}, - '$datasource', - '', - 'Under graph', - 'pie', - 'Workload Breakdown', - 'current' - ) - .addTarget(addTargetSchema('rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Failures {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'GETs {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}')) - .addTarget(addTargetSchema('(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}} - ]) -} -{ - "rbd-details.json": - local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) = - graphPanelSchema({}, title, '', 'null as zero', false, formatY1, formatY1, null, null, 0, 1, '$Datasource') - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', '{{pool}} Write'),addTargetSchema(expr2, 1, 'time_series', '{{pool}} Read')]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RBD Details', 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)', 'YhCYGcuZz', 'now-1h', false, 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('Datasource', 'prometheus', 'default', label=null) - ) - .addTemplate( - addTemplateSchema('Pool', '$Datasource', 'label_values(pool)', 1, false, 0, '', '') - ) - .addTemplate( - addTemplateSchema('Image', '$Datasource', 'label_values(image)', 1, false, 0, '', '') - ) - .addPanels([ - RbdDetailsPanel( - 'IOPS', - 'iops', - 'irate(ceph_rbd_write_ops{pool="$Pool", image="$Image"}[30s])', - 'irate(ceph_rbd_read_ops{pool="$Pool", image="$Image"}[30s])', - 0, 0, 8, 9 - ), - RbdDetailsPanel( - 'Throughput', - 'Bps', - 
'irate(ceph_rbd_write_bytes{pool="$Pool", image="$Image"}[30s])', - 'irate(ceph_rbd_read_bytes{pool="$Pool", image="$Image"}[30s])', - 8, 0, 8, 9 - ), - RbdDetailsPanel( - 'Average Latency', - 'ns', - 'irate(ceph_rbd_write_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_write_latency_count{pool="$Pool", image="$Image"}[30s])', - 'irate(ceph_rbd_read_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_read_latency_count{pool="$Pool", image="$Image"}[30s])', - 16, 0, 8, 9 - ) - ]) -} -{ - "rbd-overview.json": - local RgwOverviewStyle(alias, pattern, type, unit) = - addStyle(alias, null, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []); - local RbdOverviewPanel(title, formatY1, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) = - graphPanelSchema({}, title, '', 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RBD Overview', '', '41FrpeUiz', 'now-1h', '30s', 16, ["overview"], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.4.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='datasource', id='prometheus', name='Prometheus', version='5.0.0' - ) - .addRequired( - type='panel', id='table', name='Table', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addPanels([ - RbdOverviewPanel( - 'IOPS', - 'short', - 'round(sum(irate(ceph_rbd_write_ops[30s])))', - 'round(sum(irate(ceph_rbd_read_ops[30s])))', - 'Writes', - 'Reads', - 0, 0, 8, 7 - ), - RbdOverviewPanel( - 'Throughput', - 'Bps', - 'round(sum(irate(ceph_rbd_write_bytes[30s])))', - 'round(sum(irate(ceph_rbd_read_bytes[30s])))', - 'Write', - 'Read', - 8, 0, 8, 7 - ), - RbdOverviewPanel( - 'Average Latency', - 'ns', - 'round(sum(irate(ceph_rbd_write_latency_sum[30s])) / sum(irate(ceph_rbd_write_latency_count[30s])))', - 'round(sum(irate(ceph_rbd_read_latency_sum[30s])) / sum(irate(ceph_rbd_read_latency_count[30s])))', - 'Write', - 'Read', - 16, 0, 8, 7 - ), - addTableSchema( - '$datasource', - '', - {"col": 3,"desc": true}, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest IOPS', 'table' - ) - .addTarget( - addTargetSchema( - 'topk(10, (sort((irate(ceph_rbd_write_ops[30s]) + on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])))))', - 1, - 'table', - '') - ) + {gridPos: {x: 0, y: 7, w: 8, h: 7}}, - addTableSchema( - '$datasource', - '', - {"col": 3,"desc": true}, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest Throughput', 'table' - ) - .addTarget( - addTargetSchema( - 'topk(10, 
sort(sum(irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])) by (pool, image, namespace)))', - 1, - 'table', - '' - ) - ) + {gridPos: {x: 8, y: 7, w: 8, h: 7}}, - addTableSchema( - '$datasource', - '', - {"col": 3,"desc": true}, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest Latency', 'table' - ) - .addTarget( - addTargetSchema( - 'topk(10,\n sum(\n irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n ) by (pool, image, namespace)\n)', - 1, - 'table', - '' - ) - ) + {gridPos: {x: 16, y: 7, w: 8, h: 7}} - ]) -} -{ - "pool-overview.json": - local PoolOverviewSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '') - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local PoolOverviewStyle(alias, pattern, type, unit, colorMode, thresholds, valueMaps) = - addStyle(alias, colorMode, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, thresholds, type, unit, valueMaps); - - local PoolOverviewGraphPanel(title, description, formatY1, labelY1, expr, targetFormat, legendFormat, x, y, w, h) = - graphPanelSchema({}, title, description, 'null as zero', false, formatY1, 'short', labelY1, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Ceph Pools Overview', '', 'z99hzWtmk', 'now-1h', '15s', 22, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'Dashboard1', label='Data Source') - ) - .addTemplate( - g.template.custom(label='TopK', name='topk', current='15', query='15') - ) - .addPanels([ - PoolOverviewSingleStatPanel( - 'none', - 'Pools', - '', - 'avg', - 'count(ceph_pool_metadata)', - 'table', - 0, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'none', - 'Pools with Compression', - 'Count of the pools that have compression enabled', 'current', 'count(ceph_pool_metadata{compression_mode!="none"})', - '', - 3, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Total Raw Capacity', - 'Total raw capacity available to the cluster', - 'current', - 'sum(ceph_osd_stat_bytes)', - '', - 6, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Raw Capacity Consumed', - 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', - 'current', - 'sum(ceph_pool_bytes_used)', - '', - 9, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Logical Stored ', - 'Total of client data stored in the cluster', - 'current', - 'sum(ceph_pool_stored)', - '', - 12, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Compression Savings', - 'A compression saving is determined as the data eligible to be compressed minus the 
capacity used to store the data after compression', - 'current', - 'sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)', - '', - 15, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'percent', - 'Compression Eligibility', - 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data\n', - 'current', - '(sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0)) * 100', - 'table', - 18, 0, 3, 3 - ), - PoolOverviewSingleStatPanel( - 'none', - 'Compression Factor', - 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)', - 'current', - 'sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 0)', - '', - 21, 0, 3, 3 - ), - addTableSchema( - '$datasource', '', {"col": 5,"desc": true}, [PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []),PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []),PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []),PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []),PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70','85'], []),PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []),PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []),PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []),PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []),PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []),PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []),PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []),PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []),PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []),PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []),PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{"text": "ON","value": "1"}])], 'Pool Overview', 'table' - ) - .addTargets( - [addTargetSchema('(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5)', 1, 'table', 'A'), - addTargetSchema('ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata', 1, 'table', 'B'), - addTargetSchema('((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100', 1, 'table', 'C'), - addTargetSchema('(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)', 1, 'table', 'D'), - addTargetSchema('(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)', 1, 'table', 'E'), - addTargetSchema('delta(ceph_pool_stored[5d])', 1, 'table', 'F'), - addTargetSchema('rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])', 1, 'table', 'G'), - addTargetSchema('rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])', 1, 'table', 'H'), - addTargetSchema('ceph_pool_metadata', 1, 
'table', 'I'), - addTargetSchema('ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata', 1, 'table', 'J'), - addTargetSchema('ceph_pool_metadata{compression_mode!=\"none\"}', 1, 'table', 'K'), - addTargetSchema('', '', '', 'L')] - ) + {gridPos: {x: 0, y: 3, w: 24, h: 6}}, - PoolOverviewGraphPanel( - 'Top $topk Client IOPS by Pool', - 'This chart shows the sum of read and write IOPS from all clients by pool', - 'short', - 'IOPS', - 'topk($topk,round((rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])),1) * on(pool_id) group_left(instance,name) ceph_pool_metadata) ', - 'time_series', - '{{name}} ', - 0, 9, 12, 8 - ) - .addTarget( - addTargetSchema( - 'topk($topk,rate(ceph_pool_wr[30s]) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ', - 1, - 'time_series', - '{{name}} - write') - ), - PoolOverviewGraphPanel( - 'Top $topk Client Bandwidth by Pool', - 'The chart shows the sum of read and write bytes from all clients, by pool', - 'Bps', - 'Throughput', - 'topk($topk,(rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) * on(pool_id) group_left(instance,name) ceph_pool_metadata)', - 'time_series', - '{{name}}', - 12, 9, 12, 8 - ), - PoolOverviewGraphPanel( - 'Pool Capacity Usage (RAW)', - 'Historical view of capacity usage, to help identify growth and trends in pool consumption', - 'bytes', - 'Capacity Used', - 'ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata', - '', - '{{name}}', - 0, 17, 24, 7 - ) - ]) -} -{ - "pool-detail.json": - local PoolDetailSingleStatPanel(format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds) - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local PoolDetailGraphPanel(alias, title, description, formatY1, labelY1, expr, targetFormat, legendFormat, x, y, w, h) = - graphPanelSchema(alias, title, description, 'null as zero', false, formatY1, 'short', labelY1, null, null, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Ceph Pool Details', '', '-xyV8KCiz', 'now-1h', '15s', 22, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'Prometheus admin.virt1.home.fajerski.name:9090', label='Data Source') - ) - .addTemplate( - addTemplateSchema('pool_name', '$datasource', 'label_values(ceph_pool_metadata,name)', 1, false, 1, 'Pool Name', '') - ) - .addPanels([ - PoolDetailSingleStatPanel( - 'percentunit', - 'Capacity used', - '', - 'current', - true, 1, true, true, - '.7,.8', - '(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', - 'time_series', - 0, 0, 7, 7 - ), - 
-      PoolDetailSingleStatPanel(
-        's',
-        'Time till full',
-        'Time till pool is full assuming the average fill rate of the last 6 hours',
-        false, 100, false, false,
-        '',
-        'current',
-        '(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} > 0',
-        'time_series',
-        7, 0, 5, 7
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Object Ingress/Egress',
-        '',
-        'ops',
-        'Objects out(-) / in(+) ',
-        'deriv(ceph_pool_objects[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'Objects per second',
-        12, 0, 12, 7
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Client IOPS',
-        '',
-        'iops',
-        'Read (-) / Write (+)',
-        'irate(ceph_pool_rd[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'reads',
-        0, 7, 12, 7
-      )
-      .addSeriesOverride({"alias": "reads","transform": "negative-Y"})
-      .addTarget(
-        addTargetSchema(
-          'irate(ceph_pool_wr[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-          1,
-          'time_series',
-          'writes'
-        )
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Client Throughput',
-        '',
-        'Bps', 'Read (-) / Write (+)',
-        'irate(ceph_pool_rd_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'reads',
-        12, 7, 12, 7
-      )
-      .addSeriesOverride({"alias": "reads","transform": "negative-Y"})
-      .addTarget(
-        addTargetSchema(
-          'irate(ceph_pool_wr_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-          1,
-          'time_series',
-          'writes'
-        )
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Objects',
-        '',
-        'short',
-        'Objects',
-        'ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'Number of Objects',
-        0, 14, 12, 7
-      )
-    ])
-}
-{
-  "osds-overview.json":
-    local OsdOverviewStyle(alias, pattern, type, unit) =
-      addStyle(alias, null, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
-    local OsdOverviewGraphPanel(alias, title, description, formatY1, labelY1, min, expr, legendFormat1, x, y, w, h) =
-      graphPanelSchema(alias, title, description, 'null', false, formatY1, 'short', labelY1, null, min, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat1)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-    local OsdOverviewPieChartPanel(alias, description, title) =
-      addPieChartSchema(alias, '$datasource', description, 'Under graph', 'pie', title, 'current');
-    local OsdOverviewSingleStatPanel(colors, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) =
-      addSingelStatSchema(colors, '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds)
-      .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'OSD Overview', '', 'lo02I1Aiz', 'now-1h', '10s', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='table', name='Table', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addPanels([
-      OsdOverviewGraphPanel(
-        {"@95%ile": "#e0752d"},
-        'OSD Read Latencies',
-        '',
-        'ms',
-        null,
-        '0',
-        'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
-        'AVG read',
-        0, 0, 8, 8
-      )
-      .addTargets(
-        [
-          addTargetSchema(
-            'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
-            1,
-            'time_series',
-            'MAX read'),
-          addTargetSchema(
-            'quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)',
-            1,
-            'time_series',
-            '@95%ile'
-          )],
-      ),
-      addTableSchema(
-        '$datasource',
-        'This table shows the osd\'s that are delivering the 10 highest read latencies within the cluster',
-        {"col": 2,"desc": true},
-        [OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),OsdOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest READ Latencies', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 8, y: 0, w: 4, h: 8}},
-      OsdOverviewGraphPanel(
-        {"@95%ile write": "#e0752d"},
-        'OSD Write Latencies',
-        '',
-        'ms',
-        null,
-        '0',
-        'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
-        'AVG write',
-        12, 0, 8, 8
-      )
-      .addTargets(
-        [
-          addTargetSchema(
-            'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
-            1,
-            'time_series',
-            'MAX write'
-          ),
-          addTargetSchema(
-            'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)',
-            1,
-            'time_series',
-            '@95%ile write'
-          )],
-      ),
-      addTableSchema(
-        '$datasource',
-        'This table shows the osd\'s that are delivering the 10 highest write latencies within the cluster',
-        {"col": 2,"desc": true},
-        [OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),OsdOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest WRITE Latencies', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 20, y: 0, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {},
-        '',
-        'OSD Types Summary'
-      )
-      .addTarget(addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}')) + {gridPos: {x: 0, y: 8, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {"Non-Encrypted": "#E5AC0E"},
-        '',
-        'OSD Objectstore Types'
-      )
-      .addTarget(addTargetSchema('count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore'))
-      .addTarget(addTargetSchema('absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore')) + {gridPos: {x: 4, y: 8, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {},
-        'The pie chart shows the various OSD sizes used within the cluster',
-        'OSD Size Summary'
-      )
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '<12TB+')) + {gridPos: {x: 8, y: 8, w: 4, h: 8}},
-      g.graphPanel.new(bars=true, datasource='$datasource', title='Distribution of PGs per OSD', x_axis_buckets=20, x_axis_mode='histogram', x_axis_values=['total'], formatY1='short', formatY2='short', labelY1='# of OSDs', min='0', nullPointMode='null')
-      .addTarget(addTargetSchema('ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD')) + {gridPos: {x: 12, y: 8, w: 8, h: 8}},
-      OsdOverviewSingleStatPanel(
-        ['#d44a3a', '#299c46'],
-        'percentunit',
-        'OSD onode Hits Ratio',
-        'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
-        'current',
-        true, 1, true, false,
-        '.75',
-        'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))',
-        'time_series',
-        20, 8, 4, 8
-      ),
-      addRowSchema(false, true, 'R/W Profile') + {gridPos: {x: 0, y: 16, w: 24, h: 1}},
-      OsdOverviewGraphPanel(
-        {},
-        'Read/Write Profile',
-        'Show the read/write workload profile overtime',
-        'short',
-        null,
-        null,
-        'round(sum(irate(ceph_pool_rd[30s])))',
-        'Reads',
-        0, 17, 24, 8
-      )
-      .addTargets([addTargetSchema('round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes')])
-    ])
-}
-{
-  "osd-device-details.json":
-    local OsdDeviceDetailsPanel(title, description, formatY1, labelY1, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) =
-      graphPanelSchema({}, title, description, 'null', false, formatY1, 'short', labelY1, null, null, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'OSD device details', '', 'CrAHE0iZz', 'now-3h', '', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addTemplate(
-      addTemplateSchema('osd', '$datasource', 'label_values(ceph_osd_metadata,ceph_daemon)', 1, false, 1, 'OSD', '(.*)')
-    )
-    .addPanels([
-      addRowSchema(false, true, 'OSD Performance') + {gridPos: {x: 0, y: 0, w: 24, h: 1}},
-      OsdDeviceDetailsPanel(
-        '$osd Latency',
-        '',
-        's',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])',
-        'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])',
-        'read',
-        'write',
-        0, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "read","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        '$osd R/W IOPS',
-        '',
-        'short',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])',
-        'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])',
-        'Reads',
-        'Writes',
-        6, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "Reads","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        '$osd R/W Bytes',
-        '',
-        'bytes',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])',
-        'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])',
-        'Read Bytes',
-        'Write Bytes',
-        12, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "Read Bytes","transform": "negative-Y"}),
-      addRowSchema(false, true, 'Physical Device Performance') + {gridPos: {x: 0, y: 10, w: 24, h: 1}},
-      OsdDeviceDetailsPanel(
-        'Physical Device Latency for $osd',
-        '',
-        's',
-        'Read (-) / Write (+)',
-        '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
-        '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
-        '{{instance}}/{{device}} Reads',
-        '{{instance}}/{{device}} Writes',
-        0, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        'Physical Device R/W IOPS for $osd',
-        '',
-        'short',
-        'Read (-) / Write (+)',
-        'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        '{{device}} on {{instance}} Writes',
-        '{{device}} on {{instance}} Reads',
-        6, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        'Physical Device R/W Bytes for $osd',
-        '',
-        'Bps',
-        'Read (-) / Write (+)',
-        'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        '{{instance}} {{device}} Reads',
-        '{{instance}} {{device}} Writes',
-        12, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      graphPanelSchema(
-        {},
-        'Physical Device Util% for $osd',
-        '',
-        'null',
-        false,
-        'percentunit',
-        'short',
-        null, null, null,
-        1,
-        '$datasource'
-      )
-      .addTarget(
-        addTargetSchema(
-          'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-          1,
-          'time_series',
-          '{{device}} on {{instance}}'
-        )) + {gridPos: {x: 18, y: 11, w: 6, h: 9}},
-    ])
-}
-{
-  "cephfs-overview.json":
-    local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
-      graphPanelSchema({}, title, '', 'null', false, formatY1, 'short', labelY1, null, 0, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'MDS Performance', '', 'tbO9LAiZz', 'now-1h', '15s', 16, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addTemplate(
-      addTemplateSchema('mds_servers', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, 'MDS Server', '')
-    )
-    .addPanels([
-      addRowSchema(false, true, 'MDS Performance') + {gridPos: {x: 0, y: 0, w: 24, h: 1}},
-      CephfsOverviewGraphPanel(
-        'MDS Workload - $mds_servers',
-        'none',
-        'Reads(-) / Writes (+)',
-        'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))',
-        'Read Ops',
-        0, 1, 12, 9
-      )
-      .addTarget(addTargetSchema('sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))', 1, 'time_series', 'Write Ops'))
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      CephfsOverviewGraphPanel(
-        'Client Request Load - $mds_servers',
-        'none',
-        'Client Requests',
-        'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}',
-        '{{ceph_daemon}}',
-        12, 1, 12, 9
-      )
-    ])
-}
diff --git a/monitoring/grafana/dashboards/test-jsonnet.sh b/monitoring/grafana/dashboards/test-jsonnet.sh
deleted file mode 100644
index 127992c057554..0000000000000
--- a/monitoring/grafana/dashboards/test-jsonnet.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-TEMPDIR=`mktemp -d`
-BASEDIR=$(dirname "$0")
-
-JSONNET_PATH="${GRAFONNET_PATH}" jsonnet -m ${TEMPDIR} $BASEDIR/jsonnet/grafana_dashboards.jsonnet
-
-truncate -s 0 ${TEMPDIR}/json_difference.log
-for json_files in $BASEDIR/*.json
-do
-  JSON_FILE_NAME=$(basename $json_files)
-  for generated_files in ${TEMPDIR}/*.json
-  do
-    GENERATED_FILE_NAME=$(basename $generated_files)
-    if [ $JSON_FILE_NAME == $GENERATED_FILE_NAME ]; then
-      jsondiff --indent 2 $generated_files $json_files | tee -a ${TEMPDIR}/json_difference.log
-    fi
-  done
-done
-
-if [[ $(wc -l < ${TEMPDIR}/json_difference.log) -eq 0 ]]
-then
-  rm -rf ${TEMPDIR}
-  echo "Congratulations! Grafonnet Check Passed"
-else
-  rm -rf ${TEMPDIR}
-  echo "Grafonnet Check Failed, failed comparing generated file with existing"
-  exit 1
-fi
diff --git a/monitoring/grafana/dashboards/tox.ini b/monitoring/grafana/dashboards/tox.ini
deleted file mode 100644
index c489a7897113f..0000000000000
--- a/monitoring/grafana/dashboards/tox.ini
+++ /dev/null
@@ -1,45 +0,0 @@
-[tox]
-envlist = grafonnet-{check,fix},lint,promql-query-test
-skipsdist = true
-
-[grafonnet]
-deps =
-    -rrequirements-grafonnet.txt
-
-[testenv:grafonnet-{check,fix}]
-basepython = python3
-whitelist_externals =
-    jsonnet
-    bash
-description =
-    check: Ensure that auto-generated grafana dashboard files matches the current version
-    fix: generate dashboard json files from jsonnet file with latest changes
-deps =
-    {[grafonnet]deps}
-passenv = GRAFONNET_PATH
-commands =
-    check: bash test-jsonnet.sh
-    fix: jsonnet -m . jsonnet/grafana_dashboards.jsonnet
-
-
-[testenv:lint]
-description =
-    Run linters
-deps =
-    -rrequirements-lint.txt
-setenv =
-commands =
-    pylint --rcfile=.pylintrc tests
-    mypy tests
-    isort tests
-
-[testenv:promql-query-test]
-description =
-    Run promtool unit testing on grafana queries.
-deps =
-    -rtests/requirements.txt
-depends = grafonnet-check
-setenv =
-commands =
-    python -m doctest tests/util.py
-    behave tests/features
diff --git a/monitoring/prometheus/CMakeLists.txt b/monitoring/prometheus/CMakeLists.txt
deleted file mode 100644
index 88c05163602af..0000000000000
--- a/monitoring/prometheus/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(tests)
diff --git a/monitoring/prometheus/README.md b/monitoring/prometheus/README.md
deleted file mode 100644
index 7ec8f309adcf5..0000000000000
--- a/monitoring/prometheus/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## Prometheus related bits
-
-### Alerts
-In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that
-should provide a decent set of default alerts for a Ceph cluster. Just put this
-file in a place according to your Prometheus configuration (wherever the `rules`
-configuration stanza points).
-
-### SNMP
-Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending Prometheus
-alerts through to an SNMP management platform. The translation from Prometheus
-alert to SNMP trap requires the Prometheus alert to contain an OID that maps to
-a definition within the MIB. When making changes to the Prometheus alert rules
-file, developers should include any necessary changes to the MIB.
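
Note: the packaging hunks earlier in this change keep installing the relocated rules under the same name, `/etc/prometheus/ceph/ceph_default_alerts.yml`, so existing Prometheus setups keep working. As a minimal, illustrative sketch (not part of this patch) of how the removed README's "wherever the `rules` configuration stanza points" advice maps onto a modern Prometheus configuration, which uses `rule_files`:

```yaml
# prometheus.yml — minimal sketch; the rules path below assumes the
# packaged location used by ceph.spec.in and debian/rules in this change.
global:
  evaluation_interval: 1m

rule_files:
  - /etc/prometheus/ceph/ceph_default_alerts.yml
```
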
diff --git a/monitoring/prometheus/tests/CMakeLists.txt b/monitoring/prometheus/tests/CMakeLists.txt
deleted file mode 100644
index 15fce8e1e02b7..0000000000000
--- a/monitoring/prometheus/tests/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-if(WITH_TESTS)
-  include(AddCephTest)
-  # add_tox_test(prometheus-alerts ${CMAKE_CURRENT_SOURCE_DIR} TOX_ENVS py3)
-endif()
diff --git a/monitoring/prometheus/tests/settings.py b/monitoring/prometheus/tests/settings.py
deleted file mode 100644
index c54f141a3edf6..0000000000000
--- a/monitoring/prometheus/tests/settings.py
+++ /dev/null
@@ -1,2 +0,0 @@
-ALERTS_FILE = '../alerts/ceph_default_alerts.yml'
-UNIT_TESTS_FILE = 'test_alerts.yml'
\ No newline at end of file
diff --git a/monitoring/prometheus/tests/tox.ini b/monitoring/prometheus/tests/tox.ini
deleted file mode 100644
index b96390160b70f..0000000000000
--- a/monitoring/prometheus/tests/tox.ini
+++ /dev/null
@@ -1,11 +0,0 @@
-[tox]
-envlist = py3
-skipsdist = true
-
-[testenv]
-deps =
-    -rrequirements.txt
-    pytest
-commands =
-    pytest -rA test_syntax.py test_unittests.py
-    ./validate_rules.py
diff --git a/src/pybind/mgr/dashboard/grafana.py b/src/pybind/mgr/dashboard/grafana.py
index 6e3ae5a221bfc..8edf9c57d0efb 100644
--- a/src/pybind/mgr/dashboard/grafana.py
+++ b/src/pybind/mgr/dashboard/grafana.py
@@ -104,7 +104,7 @@ def load_local_dashboards():
     if os.environ.get('CEPH_DEV') == '1' or 'UNITTEST' in os.environ:
         path = os.path.abspath(os.path.join(
             os.path.dirname(__file__),
-            '../../../../monitoring/grafana/dashboards/'
+            '../../../../monitoring/ceph-mixin/dashboards_out/'
         ))
     else:
         path = '/etc/grafana/dashboards/ceph-dashboard'
diff --git a/src/pybind/mgr/dashboard/tox.ini b/src/pybind/mgr/dashboard/tox.ini
index 21b831f90143f..f412c0bdf18e9 100644
--- a/src/pybind/mgr/dashboard/tox.ini
+++ b/src/pybind/mgr/dashboard/tox.ini
@@ -153,7 +153,7 @@ commands =
 
 [testenv:check]
 commands =
-    python ci/check_grafana_dashboards.py frontend/src/app ../../../../monitoring/grafana/dashboards
+    python ci/check_grafana_dashboards.py frontend/src/app ../../../../monitoring/ceph-mixin/dashboards_out
 
 [testenv:openapi-{check,fix}]
 basepython = python3
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index bd8c3b9bc8eae..54c49970ebc67 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -610,27 +610,6 @@ add_ceph_test(run-cli-tests ${CMAKE_CURRENT_SOURCE_DIR}/run-cli-tests)
 
 add_ceph_test(smoke.sh ${CMAKE_CURRENT_SOURCE_DIR}/smoke.sh)
 
-if(DEFINED PROMTOOL_EXECUTABLE)
-  set(promtool_executable_checked TRUE)
-endif()
-
-find_program(PROMTOOL_EXECUTABLE promtool)
-if(PROMTOOL_EXECUTABLE)
-  execute_process(
-    COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null
-    RESULT_VARIABLE rc
-    OUTPUT_QUIET)
-  if(NOT rc)
-    add_ceph_test(run-promtool-unittests
-      ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_SOURCE_DIR}/monitoring/prometheus/tests/test_alerts.yml)
-  elseif(NOT promtool_executable_checked)
-    message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, "
-      "please use a newer prometheus")
-  endif()
-elseif(NOT promtool_executable_checked)
-  message(WARNING "run-promtool-unittests is skipped due to missing promtool")
-endif()
-
 set_property(
   TEST ${tox_tests}
   PROPERTY ENVIRONMENT ${env_vars_for_tox_tests})
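
Note: the promtool gate removed here is re-added next to the mixin in monitoring/ceph-mixin/CMakeLists.txt, pointed at the relocated test file. A rough manual equivalent from a source checkout, assuming a promtool new enough to support `test rules`:

```sh
# Run the relocated alert-rule unit tests by hand; mirrors the
# run-promtool-unittests CMake test registered in monitoring/ceph-mixin.
promtool test rules monitoring/ceph-mixin/tests_alerts/test_alerts.yml
```
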