From 53520112252917dd3b50b6e17bf2d6ec9780c24c Mon Sep 17 00:00:00 2001 From: xeniape Date: Wed, 15 Oct 2025 13:51:14 +0200 Subject: [PATCH 1/2] chore: ensure metrics are correctly exposed --- .../hbase/pages/usage-guide/monitoring.adoc | 19 +++- rust/operator-binary/src/crd/mod.rs | 30 +++++ rust/operator-binary/src/hbase_controller.rs | 104 ++++++++++++++++-- 3 files changed, 140 insertions(+), 13 deletions(-) diff --git a/docs/modules/hbase/pages/usage-guide/monitoring.adoc b/docs/modules/hbase/pages/usage-guide/monitoring.adoc index 2c9b7570..8391ec70 100644 --- a/docs/modules/hbase/pages/usage-guide/monitoring.adoc +++ b/docs/modules/hbase/pages/usage-guide/monitoring.adoc @@ -6,5 +6,20 @@ See xref:operators:monitoring.adoc[] for more details. Starting with HBase 2.6 the URL for Prometheus metrics has changed. This is because HBase offers now a built-in endpoint for this purpose. -This endpoint is available from the UI service. -For example, in the case of the master service, the URL is `http://:16010/prometheus`. +This endpoint is available from the `metrics` Services. +For example, in the case of the master Service, the URL is `http://<cluster-name>-master-<rolegroup-name>-metrics:16010/prometheus`. + +== Authentication when using TLS + +HBase exposes metrics through the same port as its web UI. Hence, when configuring HBase with TLS, the metrics are also secured by TLS, +and the clients scraping the metrics endpoint need to authenticate against it. This could, for example, be accomplished by utilizing mTLS +between Kubernetes Pods with the xref:home:secret-operator:index.adoc[Secret Operator]. + +When using a Prometheus `ServiceMonitor` for scraping, the `address` label needs relabeling to use the `headless` Service instead of the +`metrics` Service. This is because by default Prometheus targets the Pod IPs as endpoints, but since the Pod IPs are not +part of the certificate, the authentication will fail. 
Instead, the FQDN of the Pods, which can be added to the certificate, is used, but +this FQDN is only available through the `headless` Service. + +A more detailed explanation can be found in the xref:home:nifi:usage_guide/monitoring.adoc[NiFi Operator Monitoring Docs], which cover a similar situation, +and an example of a Prometheus `ServiceMonitor` configured for TLS can be found in the +https://github.com/stackabletech/demos/blob/main/stacks/monitoring/prometheus-service-monitors.yaml[Monitoring Stack{external-link-icon}^]. diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 4a67187a..69d91b44 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -69,16 +69,20 @@ pub const HBASE_UI_PORT_NAME_HTTP: &str = "ui-http"; pub const HBASE_UI_PORT_NAME_HTTPS: &str = "ui-https"; pub const HBASE_REST_PORT_NAME_HTTP: &str = "rest-http"; pub const HBASE_REST_PORT_NAME_HTTPS: &str = "rest-https"; +pub const HBASE_METRICS_PORT_NAME: &str = "metrics"; pub const HBASE_MASTER_PORT: u16 = 16000; // HBase always uses 16010, regardless of http or https. On 2024-01-17 we decided in Arch-meeting that we want to stick // the port numbers to what the product is doing, so we get the least surprise for users - even when this means we have // inconsistency between Stackable products. pub const HBASE_MASTER_UI_PORT: u16 = 16010; +pub const HBASE_MASTER_METRICS_PORT: u16 = 16010; pub const HBASE_REGIONSERVER_PORT: u16 = 16020; pub const HBASE_REGIONSERVER_UI_PORT: u16 = 16030; +pub const HBASE_REGIONSERVER_METRICS_PORT: u16 = 16030; pub const HBASE_REST_PORT: u16 = 8080; pub const HBASE_REST_UI_PORT: u16 = 8085; +pub const HBASE_REST_METRICS_PORT: u16 = 8085; pub const LISTENER_VOLUME_NAME: &str = "listener"; pub const LISTENER_VOLUME_DIR: &str = "/stackable/listener"; @@ -542,6 +546,24 @@ impl v1alpha1::HbaseCluster { } } + /// Returns required metrics port name and metrics port number tuples depending on the role. 
+ /// The metrics are available over the UI port. + pub fn metrics_ports(&self, role: &HbaseRole) -> Vec<(String, u16)> { + match role { + HbaseRole::Master => vec![( + HBASE_METRICS_PORT_NAME.to_string(), + HBASE_MASTER_METRICS_PORT, + )], + HbaseRole::RegionServer => vec![( + HBASE_METRICS_PORT_NAME.to_string(), + HBASE_REGIONSERVER_METRICS_PORT, + )], + HbaseRole::RestServer => { + vec![(HBASE_METRICS_PORT_NAME.to_string(), HBASE_REST_METRICS_PORT)] + } + } + } + pub fn service_port(&self, role: &HbaseRole) -> u16 { match role { HbaseRole::Master => HBASE_MASTER_PORT, @@ -550,6 +572,14 @@ impl v1alpha1::HbaseCluster { } } + pub fn metrics_port(&self, role: &HbaseRole) -> u16 { + match role { + HbaseRole::Master => HBASE_MASTER_METRICS_PORT, + HbaseRole::RegionServer => HBASE_REGIONSERVER_METRICS_PORT, + HbaseRole::RestServer => HBASE_REST_METRICS_PORT, + } + } + /// Name of the port used by the Web UI, which depends on HTTPS usage pub fn ui_port_name(&self) -> String { if self.has_https_enabled() { diff --git a/rust/operator-binary/src/hbase_controller.rs b/rust/operator-binary/src/hbase_controller.rs index ea19a09a..5fa13469 100644 --- a/rust/operator-binary/src/hbase_controller.rs +++ b/rust/operator-binary/src/hbase_controller.rs @@ -45,7 +45,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::controller::Action, }, - kvp::{Label, LabelError, Labels, ObjectLabels}, + kvp::{Annotations, Label, LabelError, Labels, ObjectLabels}, logging::controller::ReconcilerError, memory::{BinaryMultiple, MemoryQuantity}, product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, @@ -427,6 +427,14 @@ pub async fn reconcile_hbase( let rg_service = build_rolegroup_service(hbase, &hbase_role, &rolegroup, &resolved_product_image)?; + + let rg_metrics_service = build_rolegroup_metrics_service( + hbase, + &hbase_role, + &rolegroup, + &resolved_product_image, + )?; + let rg_configmap = build_rolegroup_config_map( hbase, 
&client.kubernetes_cluster_info, @@ -452,6 +460,12 @@ pub async fn reconcile_hbase( .with_context(|_| ApplyRoleGroupServiceSnafu { rolegroup: rolegroup.clone(), })?; + cluster_resources + .add(client, rg_metrics_service) + .await + .with_context(|_| ApplyRoleGroupServiceSnafu { + rolegroup: rolegroup.clone(), + })?; cluster_resources .add(client, rg_configmap) .await @@ -739,12 +753,9 @@ fn build_rolegroup_service( }) .collect(); - let prometheus_label = - Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?; - let metadata = ObjectMetaBuilder::new() .name_and_namespace(hbase) - .name(headless_service_name(&rolegroup.object_name())) + .name(rolegroup.rolegroup_headless_service_name()) .ownerreference_from_resource(hbase, None, Some(true)) .context(ObjectMissingMetadataForOwnerRefSnafu)? .with_recommended_labels(build_recommended_labels( @@ -754,7 +765,6 @@ fn build_rolegroup_service( &rolegroup.role_group, )) .context(ObjectMetaSnafu)? - .with_label(prometheus_label) .build(); let service_selector = @@ -778,6 +788,82 @@ fn build_rolegroup_service( }) } +/// The rolegroup metrics [`Service`] is a service that exposes metrics and a prometheus scraping label. 
+pub fn build_rolegroup_metrics_service( + hbase: &v1alpha1::HbaseCluster, + hbase_role: &HbaseRole, + rolegroup: &RoleGroupRef, + resolved_product_image: &ResolvedProductImage, +) -> Result { + let ports = hbase + .metrics_ports(hbase_role) + .into_iter() + .map(|(name, value)| ServicePort { + name: Some(name), + port: i32::from(value), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }) + .collect(); + + let service_selector = + Labels::role_group_selector(hbase, APP_NAME, &rolegroup.role, &rolegroup.role_group) + .context(BuildLabelSnafu)?; + + Ok(Service { + metadata: ObjectMetaBuilder::new() + .name_and_namespace(hbase) + .name(rolegroup.rolegroup_metrics_service_name()) + .ownerreference_from_resource(hbase, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels(build_recommended_labels( + hbase, + &resolved_product_image.app_version_label_value, + &rolegroup.role, + &rolegroup.role_group, + )) + .context(ObjectMetaSnafu)? + .with_label(Label::try_from(("prometheus.io/scrape", "true")).context(LabelBuildSnafu)?) + .with_annotations(prometheus_annotations(hbase, hbase_role)) + .build(), + spec: Some(ServiceSpec { + // Internal communication does not need to be exposed + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some(ports), + selector: Some(service_selector.into()), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }), + status: None, + }) +} + +/// Common annotations for Prometheus +/// +/// These annotations can be used in a ServiceMonitor. 
+/// +/// see also +fn prometheus_annotations(hbase: &v1alpha1::HbaseCluster, hbase_role: &HbaseRole) -> Annotations { + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/prometheus".to_owned()), + ( + "prometheus.io/port".to_owned(), + hbase.metrics_port(hbase_role).to_string(), + ), + ( + "prometheus.io/scheme".to_owned(), + if hbase.has_https_enabled() { + "https".to_owned() + } else { + "http".to_owned() + }, + ), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations") +} + /// The rolegroup [`StatefulSet`] runs the rolegroup, as configured by the administrator. /// /// The [`Pod`](`stackable_operator::k8s_openapi::api::core::v1::Pod`)s are accessible through the corresponding [`Service`] (from [`build_rolegroup_service`]). @@ -1088,7 +1174,7 @@ fn build_rolegroup_statefulset( match_labels: Some(statefulset_match_labels.into()), ..LabelSelector::default() }, - service_name: Some(headless_service_name(&rolegroup_ref.object_name())), + service_name: Some(rolegroup_ref.rolegroup_headless_service_name()), template: pod_template, volume_claim_templates: listener_pvc, ..StatefulSetSpec::default() @@ -1198,10 +1284,6 @@ fn build_hbase_env_sh( Ok(result) } -fn headless_service_name(role_group_name: &str) -> String { - format!("{name}-headless", name = role_group_name) -} - #[cfg(test)] mod test { use rstest::rstest; From 8495dda4134b7063788b2de260949a523e4184c4 Mon Sep 17 00:00:00 2001 From: xeniape Date: Wed, 15 Oct 2025 14:15:45 +0200 Subject: [PATCH 2/2] add changelog entry --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a4a1636..510f5ae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,14 +10,18 @@ - `EOS_CHECK_MODE` (`--eos-check-mode`) to set the EoS check mode. Currently, only "offline" is supported. - `EOS_INTERVAL` (`--eos-interval`) to set the interval in which the operator checks if it is EoS. 
- `EOS_DISABLED` (`--eos-disabled`) to disable the EoS checker completely. +- Add `metrics` Services ([#701]). ### Changed - Changed env-vars to be consistent with config-utils in the entrypoint script ([#700]). +- BREAKING: The `prometheus.io/scrape` label moved from the `headless` Service to the `metrics` Service, which + uses `metrics` as the port name instead of the previous `ui-http`/`ui-https` port name ([#701]). [#691]: https://github.com/stackabletech/hbase-operator/pull/691 [#697]: https://github.com/stackabletech/hbase-operator/pull/697 [#700]: https://github.com/stackabletech/hbase-operator/pull/700 +[#701]: https://github.com/stackabletech/hbase-operator/pull/701 ## [25.7.0] - 2025-07-23