From 2db1aaabe5f4627bb7b177ab3441593f08aa7cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Knecht?= Date: Thu, 28 Oct 2021 18:49:07 +0200 Subject: [PATCH 1/5] mgr/ActivePyModules: Add metadata id in dump_server() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `DaemonStateCollection` used to always contain the daemon name in its `DaemonKey`, but since #40220 (or more specifically afc33758e076761b8d4ec004c8f9c49b80a48770), the RadosGW registers with its instance ID instead (`rados.get_instance_id()`). As a result, the `ceph_rgw_*` metrics returned by `ceph-mgr` through the `prometheus` module have their `ceph_daemon` label include that ID instead of the daemon name, e.g. ``` ceph_rgw_req{ceph_daemon="rgw.127202"} ``` instead of ``` ceph_rgw_req{ceph_daemon="rgw.my-hostname.rgw0"} ``` This commit adds the daemon name from `state->metadata["id"]` if available, as `service.name` in the JSON document returned by `dump_server()`. Signed-off-by: Benoît Knecht --- src/mgr/ActivePyModules.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 808daaf7f7742..036348dc18d8c 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -76,7 +76,8 @@ void ActivePyModules::dump_server(const std::string &hostname, std::string ceph_version; for (const auto &[key, state] : dmc) { - without_gil([&ceph_version, state=state] { + std::string id; + without_gil([&ceph_version, &id, state=state] { std::lock_guard l(state->lock); // TODO: pick the highest version, and make sure that // somewhere else (during health reporting?) we are @@ -85,10 +86,16 @@ void ActivePyModules::dump_server(const std::string &hostname, if (ver_iter != state->metadata.end()) { ceph_version = state->metadata.at("ceph_version"); } + if (state->metadata.find("id") != state->metadata.end()) { + id = state->metadata.at("id"); + } }); f->open_object_section("service"); f->dump_string("type", key.type); f->dump_string("id", key.name); + if (!id.empty()) { + f->dump_string("name", id); + } f->close_section(); } f->close_section(); From 9f6573bbb1eeed8fab117149a117fdffd56bdf64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Knecht?= Date: Fri, 12 Nov 2021 15:07:35 +0100 Subject: [PATCH 2/5] pybind/mgr/mgr_module.py: Set instance_id label for rgw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the RadosGW returns its instance ID instead of its daemon name, replace the `ceph_daemon` label with an `instance_id` label on the `rgw` metrics. Signed-off-by: Benoît Knecht --- src/pybind/mgr/mgr_module.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 657cf2cceffe4..d8d19a7b0d48f 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -1296,7 +1296,13 @@ def _stattype_to_str(self, stattype: int) -> str: def _perfpath_to_path_labels(self, daemon: str, path: str) -> Tuple[str, Tuple[str, ...], Tuple[str, ...]]: - label_names = ("ceph_daemon",) # type: Tuple[str, ...] + if daemon.startswith('rgw.'): + label_name = 'instance_id' + daemon = daemon[len('rgw.'):] + else: + label_name = 'ceph_daemon' + + label_names = (label_name,) # type: Tuple[str, ...] labels = (daemon,) # type: Tuple[str, ...] 
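        # Illustration of the rgw branch above (the instance ID "127202" is the
        # hypothetical example from patch 1/5, not a value produced here):
        # daemon == "rgw.127202" yields label_name == 'instance_id' and
        # daemon == '127202', so the series is labelled
        # instance_id="127202" instead of ceph_daemon="rgw.127202".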
if daemon.startswith('rbd-mirror.'): From 01b42c1c51a1b4142adcde0c2c673b60e61e4697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Knecht?= Date: Fri, 12 Nov 2021 15:10:52 +0100 Subject: [PATCH 3/5] pybind/mgr/prometheus: Add instance_id metadata for rgw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to get the `ceph_daemon` label for `rgw` metrics corresponding to the value before #40220, we need to add the `instance_id` label to the `ceph_rgw_metadata` metric. This way, the old `ceph_daemon` label can be added to any `ceph_rgw_*` metric using the following PromQL query, for instance: ``` ceph_rgw_req * on (instance_id) group_left(ceph_daemon) ceph_rgw_metadata ``` Signed-off-by: Benoît Knecht --- src/pybind/mgr/prometheus/module.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index e585c8752e374..91f3528f79a5d 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -102,7 +102,7 @@ def health_status_to_number(status: str) -> int: POOL_METADATA = ('pool_id', 'name', 'type', 'description', 'compression_mode') -RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version') +RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version', 'instance_id') RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname', 'ceph_version') @@ -808,7 +808,7 @@ def get_fs(self) -> None: # export standby mds metadata, default standby fs_id is '-1' for standby in fs_map['standbys']: id_ = standby['name'] - host, version = servers.get((id_, 'mds'), ('', '')) + host, version, _ = servers.get((id_, 'mds'), ('', '', '')) addr, rank = standby['addr'], standby['rank'] self.metrics['mds_metadata'].set(1, ( 'mds.{}'.format(id_), '-1', @@ -830,7 +830,7 @@ def get_fs(self) -> None: self.log.debug('mdsmap: {}'.format(fs['mdsmap'])) for gid, daemon in fs['mdsmap']['info'].items(): id_ = daemon['name'] - host, version = servers.get((id_, 'mds'), ('', '')) + host, version, _ = servers.get((id_, 'mds'), ('', '', '')) self.metrics['mds_metadata'].set(1, ( 'mds.{}'.format(id_), fs['id'], host, daemon['addr'], @@ -844,7 +844,7 @@ def get_quorum_status(self) -> None: for mon in mon_status['monmap']['mons']: rank = mon['rank'] id_ = mon['name'] - host_version = servers.get((id_, 'mon'), ('', '')) + host_version = servers.get((id_, 'mon'), ('', '', '')) self.metrics['mon_metadata'].set(1, ( 'mon.{}'.format(id_), host_version[0], mon['public_addr'].rsplit(':', 1)[0], rank, @@ -870,7 +870,7 @@ def get_mgr_status(self) -> None: for module in mgr_map['available_modules']} for mgr in all_mgrs: - host, version = servers.get((mgr, 'mgr'), ('', '')) + host, version, _ = servers.get((mgr, 'mgr'), ('', '', '')) if mgr == active: _state = 1 else: @@ -928,13 +928,13 @@ def get_osd_stats(self) -> None: 'osd.{}'.format(id_), )) - def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str]]: + def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str, str]]: ret = {} for server in self.list_servers(): version = cast(str, server.get('ceph_version', '')) host = cast(str, server.get('hostname', '')) for service in cast(List[ServiceInfoT], server.get('services', [])): - ret.update({(service['id'], service['type']): (host, version)}) + ret.update({(service['id'], service['type']): (host, version, service.get('name', ''))}) return ret @profile_method() @@ -972,7 +972,7 @@ def get_metadata_and_osd_status(self) -> None: 
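        # Note: get_service_list() now returns (host, version, name) triples,
        # so the lookups below fall back to ('', '', '') rather than ('', '')
        # when a daemon is absent from the server list.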
"skipping output".format(id_)) continue - host_version = servers.get((str(id_), 'osd'), ('', '')) + host_version = servers.get((str(id_), 'osd'), ('', '', '')) # collect disk occupation metadata osd_metadata = self.get_metadata("osd", str(id_)) @@ -1086,11 +1086,11 @@ def _get_pool_info(pool: Dict[str, Any]) -> Tuple[str, str]: for key, value in servers.items(): service_id, service_type = key if service_type == 'rgw': - hostname, version = value + hostname, version, name = value self.metrics['rgw_metadata'].set( 1, - ('{}.{}'.format(service_type, service_id), - hostname, version) + ('{}.{}'.format(service_type, name), + hostname, version, service_id) ) elif service_type == 'rbd-mirror': mirror_metadata = self.get_metadata('rbd-mirror', service_id) From adc36dea7fc586c4d882462fbd3ab52006402b8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Knecht?= Date: Mon, 3 Jan 2022 16:18:39 +0100 Subject: [PATCH 4/5] monitoring/grafana: Update radosgw dashboards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the `ceph_daemon` label now replaced by `instance_id` on all `ceph_rgw_*` metrics, we need to update Grafana dashboards get that label back from `ceph_rgw_metadata` using this type of construct: ``` ceph_rgw_req * on (instance_id) group_left(ceph_daemon) ceph_rgw_metadata ``` Signed-off-by: Benoît Knecht --- .../grafana/dashboards/hosts-overview.json | 2 +- .../jsonnet/grafana_dashboards.jsonnet | 44 +++++++++---------- .../grafana/dashboards/radosgw-detail.json | 26 +++++------ .../grafana/dashboards/radosgw-overview.json | 14 +++--- .../tests/features/radosgw_overview.feature | 21 +++++---- 5 files changed, 55 insertions(+), 52 deletions(-) diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/grafana/dashboards/hosts-overview.json index 115c1824974fe..91369b56e842d 100644 --- a/monitoring/grafana/dashboards/hosts-overview.json +++ b/monitoring/grafana/dashboards/hosts-overview.json @@ -796,7 +796,7 @@ "multi": false, "name": "rgw_hosts", "options": [ ], - "query": "label_values(ceph_rgw_qlen, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "rgw.(.*)", "sort": 1, diff --git a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet index d9deca18cc5f4..270d488e0ec19 100644 --- a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet +++ b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet @@ -70,7 +70,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt addTemplateSchema('mds_hosts', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, null, 'mds.(.*)') ) .addTemplate( - addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_qlen, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') + addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') ) .addPanels([ HostsOverviewSingleStatPanel( @@ -450,7 +450,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt type='panel', id='graph', name='Graph', version='5.0.0' ) .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_req, ceph_daemon)', 1, true, 1, '', '') + addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') ) .addTemplate( addTemplateSchema('code', '$datasource', 
'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', 1, true, 1, 'HTTP Code', '') @@ -468,14 +468,14 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 's', 'short', - 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])', + 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', 'GET AVG', 0, 1, 8, 7 ) .addTargets( [ addTargetSchema( - 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])', + 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', 1, 'time_series', 'PUT AVG' @@ -485,7 +485,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'none', 'short', - 'sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', + 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', '{{rgw_host}}', 8, 1, 7, 7 ), @@ -494,7 +494,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', 's', 'short', - 'label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)")', + 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', '{{rgw_host}}', 15, 1, 6, 7 ), @@ -520,7 +520,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Total bytes transferred in/out through get/put operations, by radosgw instance', 'bytes', 'short', - 'sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), "rgw_host","$1","ceph_daemon","rgw.(.*)")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), "rgw_host","$1","ceph_daemon","rgw.(.*)"))\n)', + 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', '{{rgw_host}}', 8, 8, 7, 6 ), @@ -529,7 +529,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', 's', 'short', - 'label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)")', + 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', '{{rgw_host}}', 15, 8, 6, 6 ), @@ -659,7 +659,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_req, ceph_daemon)', 1, 
true, 1, '', '') + addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') ) .addPanels([ addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, @@ -669,8 +669,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 's', 'short', - 'sum by (ceph_daemon) (rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~"($rgw_servers)"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~"($rgw_servers)"}[30s]))', - 'sum by (ceph_daemon)(rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~"($rgw_servers)"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~"($rgw_servers)"}[30s]))', + 'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'GET {{ceph_daemon}}', 'PUT {{ceph_daemon}}', 0, 1, 6, 8 @@ -681,8 +681,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'bytes', 'short', - 'rate(ceph_rgw_get_b{ceph_daemon=~"$rgw_servers"}[30s])', - 'rate(ceph_rgw_put_b{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'GETs {{ceph_daemon}}', 'PUTs {{ceph_daemon}}', 6, 1, 7, 8 @@ -693,8 +693,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'short', 'short', - 'rate(ceph_rgw_failed_req{ceph_daemon=~"$rgw_servers"}[30s])', - 'rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'Requests Failed {{ceph_daemon}}', 'GETs {{ceph_daemon}}', 13, 1, 7, 8 @@ -702,13 +702,13 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt .addTargets( [ addTargetSchema( - 'rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}' ), addTargetSchema( - 'rate(ceph_rgw_req{ceph_daemon=~"$rgw_servers"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s]))', + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other {{ceph_daemon}}' @@ -722,10 +722,10 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Workload Breakdown', 'current' ) - .addTarget(addTargetSchema('rate(ceph_rgw_failed_req{ceph_daemon=~"$rgw_servers"}[30s])', 1, 'time_series', 'Failures {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s])', 1, 'time_series', 'GETs {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s])', 1, 
'time_series', 'PUTs {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_req{ceph_daemon=~"$rgw_servers"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s]))', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}} + .addTarget(addTargetSchema('rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Failures {{ceph_daemon}}')) + .addTarget(addTargetSchema('rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'GETs {{ceph_daemon}}')) + .addTarget(addTargetSchema('rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}')) + .addTarget(addTargetSchema('(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}} ]) } { @@ -1171,7 +1171,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt addPieChartSchema(alias, '$datasource', description, 'Under graph', 'pie', title, 'current'); local OsdOverviewSingleStatPanel(colors, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) = addSingelStatSchema(colors, '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds) - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; + .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; dashboardSchema( 'OSD Overview', '', 'lo02I1Aiz', 'now-1h', '10s', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} diff --git a/monitoring/grafana/dashboards/radosgw-detail.json b/monitoring/grafana/dashboards/radosgw-detail.json index 432eecc837ca4..53486475cbb51 100644 --- a/monitoring/grafana/dashboards/radosgw-detail.json +++ b/monitoring/grafana/dashboards/radosgw-detail.json @@ -104,14 +104,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (ceph_daemon) (rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))", + "expr": "sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{ceph_daemon}}", "refId": "A" }, { - "expr": "sum by (ceph_daemon)(rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))", + "expr": "sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{ceph_daemon}}", @@ -196,14 +196,14 @@ "steppedLine": false, "targets": [ { - "expr": 
"rate(ceph_rgw_get_b{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_put_b{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", @@ -294,28 +294,28 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Requests Failed {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "rate(ceph_rgw_req{ceph_daemon=~\"$rgw_servers\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s]))", + "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Other {{ceph_daemon}}", @@ -384,28 +384,28 @@ "pieType": "pie", "targets": [ { - "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Failures {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "rate(ceph_rgw_req{ceph_daemon=~\"$rgw_servers\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s]))", + "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", @@ -450,7 +450,7 @@ "multi": false, "name": 
"rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_req, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/grafana/dashboards/radosgw-overview.json index 489f29a2fc783..7fe94138b1356 100644 --- a/monitoring/grafana/dashboards/radosgw-overview.json +++ b/monitoring/grafana/dashboards/radosgw-overview.json @@ -98,14 +98,14 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])", + "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET AVG", "refId": "A" }, { - "expr": "rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])", + "expr": "rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT AVG", @@ -190,7 +190,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))", + "expr": "sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -275,7 +275,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n\"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -452,7 +452,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -537,7 +537,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n\"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -1093,7 +1093,7 @@ "multi": false, "name": "rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_req, ceph_daemon)", 
+ "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature index b77d56616bd9d..3e9724ee2dc26 100644 --- a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature +++ b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature @@ -3,27 +3,30 @@ Feature: RGW Overview Dashboard Scenario: "Test Average GET Latencies" Given the following series: | metrics | values | - | ceph_rgw_get_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 10 50 100 | - | ceph_rgw_get_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 20 60 80 | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Average GET/PUT Latencies` with legend `GET AVG` shows: | metrics | values | - | {ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 2.5000000000000004 | + | {ceph_daemon="rgw.foo",instance="127.0.0.1", instance_id="58892247", job="ceph"} | 2.5000000000000004 | Scenario: "Test Average PUT Latencies" Given the following series: | metrics | values | - | ceph_rgw_put_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 15 35 55 | - | ceph_rgw_put_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 10 30 50 | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Average GET/PUT Latencies` with legend `PUT AVG` shows: | metrics | values | - | {ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 1 | + | {ceph_daemon="rgw.foo",instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 | Scenario: "Test Total Requests/sec by RGW Instance" Given the following series: | metrics | values | - | ceph_rgw_req{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 | + | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | @@ -32,7 +35,7 @@ Scenario: "Test Total Requests/sec by RGW Instance" Scenario: "Test Bandwidth Consumed by Type- GET" Given the following series: | metrics | values | - | ceph_rgw_get_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | When evaluation time is `1m` And interval is `30s` Then Grafana panel `Bandwidth Consumed by Type` with legend `GETs` shows: @@ -42,7 +45,7 @@ Scenario: "Test Bandwidth Consumed by Type- GET" Scenario: "Test Bandwidth Consumed by Type- PUT" Given the following series: | metrics | values | - | 
ceph_rgw_put_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 5 20 50 | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | When evaluation time is `1m` And interval is `30s` Then Grafana panel `Bandwidth Consumed by Type` with legend `PUTs` shows: From 2daaa052ea82ff806a529402e802adbbbe9b4554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Knecht?= Date: Wed, 5 Jan 2022 19:12:55 +0100 Subject: [PATCH 5/5] monitoring/grafana: Add tests for radosgw panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the expressions modified in c40290390d7 were not covered by any tests, especially those in the `radosgw-detail.json` dashboard. This commit fills in those gaps. Signed-off-by: Benoît Knecht --- .../tests/features/radosgw-detail.feature | 139 ++++++++++++++++++ .../tests/features/radosgw_overview.feature | 35 +++++ 2 files changed, 174 insertions(+) create mode 100644 monitoring/grafana/dashboards/tests/features/radosgw-detail.feature diff --git a/monitoring/grafana/dashboards/tests/features/radosgw-detail.feature b/monitoring/grafana/dashboards/tests/features/radosgw-detail.feature new file mode 100644 index 0000000000000..bcc793a21a55e --- /dev/null +++ b/monitoring/grafana/dashboards/tests/features/radosgw-detail.feature @@ -0,0 +1,139 @@ +Feature: RGW Host Detail Dashboard + +Scenario: "Test $rgw_servers GET/PUT Latencies - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `GET {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 2.5000000000000004 | + +Scenario: "Test $rgw_servers GET/PUT Latencies - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `PUT {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 1 | + +Scenario: "Test Bandwidth by HTTP Operation - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1.6666666666666667 | + +Scenario: "Test Bandwidth by HTTP Operation - PUT" + Given the following series: + | metrics | values | + | 
ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 | + +Scenario: "Test HTTP Request Breakdown - Requests Failed" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Requests Failed {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 6.666666666666667e-02 | + +Scenario: "Test HTTP Request Breakdown - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .6666666666666666 | + +Scenario: "Test HTTP Request Breakdown - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 2.3333333333333335 | + +Scenario: "Test HTTP Request Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Other {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | + +Scenario: "Test Workload Breakdown - Failures" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + 
Then Grafana panel `Workload Breakdown` with legend `Failures {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 6.666666666666667e-02 | + +Scenario: "Test Workload Breakdown - GETs" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .6666666666666666 | + +Scenario: "Test Workload Breakdown - PUTs" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 2.3333333333333335 | + +Scenario: "Test Workload Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `Other (DELETE,LIST) {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature index 3e9724ee2dc26..69e46b1d511d7 100644 --- a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature +++ b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature @@ -32,6 +32,17 @@ Scenario: "Test Total Requests/sec by RGW Instance" | metrics | values | | {rgw_host="1"} | 1.6666666666666667 | +Scenario: "Test GET Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 2.5000000000000004 | + Scenario: "Test Bandwidth Consumed by Type- GET" Given the following series: | metrics | values | @@ -52,6 +63,30 @@ Scenario: 
"Test Bandwidth Consumed by Type- PUT" | metrics | values | | {} | 1 | +Scenario: "Test Bandwidth by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.666666666666667 | + +Scenario: "Test PUT Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | + Scenario: "Test Total backend responses by HTTP code" Given the following series: | metrics | values |