diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f5ff8d3..466f91f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]). +- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]). + +### Changed + +- BREAKING: The per-rolegroup services now only serve the HTTP port and have a `-headless` suffix to better indicate their + purpose and to be consistent with other operators ([#748]). +- BREAKING: The per-role server service is now suffixed with `-server` to be consistent with other operators ([#748]). + +[#748]: https://github.com/stackabletech/opa-operator/pull/748 + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 87d5291e..0c6f0384 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -50,7 +50,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::{controller::Action, reflector::ObjectRef}, }, - kvp::{Label, LabelError, Labels, ObjectLabels}, + kvp::{LabelError, Labels, ObjectLabels}, logging::controller::ReconcilerError, memory::{BinaryMultiple, MemoryQuantity}, product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, @@ -91,6 +91,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active"; pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming"; pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp"; pub const BUNDLE_BUILDER_PORT: i32 = 3030; +pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable"; const CONFIG_VOLUME_NAME: &str = "config"; const CONFIG_DIR: &str = "/stackable/config"; @@ -185,6 +186,12 @@ pub enum Error { rolegroup: RoleGroupRef, }, + #[snafu(display("failed to 
apply metrics Service for [{rolegroup}]"))] + ApplyRoleGroupMetricsService { + source: stackable_operator::cluster_resources::Error, + rolegroup: RoleGroupRef, + }, + #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))] BuildRoleGroupConfig { source: stackable_operator::builder::configmap::Error, @@ -337,19 +344,20 @@ pub struct OpaClusterConfigFile { bundles: OpaClusterBundle, #[serde(skip_serializing_if = "Option::is_none")] decision_logs: Option, + status: Option, } impl OpaClusterConfigFile { pub fn new(decision_logging: Option) -> Self { Self { services: vec![OpaClusterConfigService { - name: String::from("stackable"), - url: String::from("http://localhost:3030/opa/v1"), + name: OPA_STACKABLE_SERVICE_NAME.to_owned(), + url: "http://localhost:3030/opa/v1".to_owned(), }], bundles: OpaClusterBundle { stackable: OpaClusterBundleConfig { - service: String::from("stackable"), - resource: String::from("opa/bundle.tar.gz"), + service: OPA_STACKABLE_SERVICE_NAME.to_owned(), + resource: "opa/bundle.tar.gz".to_owned(), persist: true, polling: OpaClusterBundleConfigPolling { min_delay_seconds: 10, @@ -358,6 +366,12 @@ impl OpaClusterConfigFile { }, }, decision_logs: decision_logging, + // Enable more Prometheus metrics, such as bundle loads + // See https://www.openpolicyagent.org/docs/monitoring#status-metrics + status: Some(OpaClusterConfigStatus { + service: OPA_STACKABLE_SERVICE_NAME.to_owned(), + prometheus: true, + }), } } } @@ -392,6 +406,12 @@ pub struct OpaClusterConfigDecisionLog { console: bool, } +#[derive(Serialize, Deserialize)] +struct OpaClusterConfigStatus { + service: String, + prometheus: bool, +} + pub async fn reconcile_opa( opa: Arc>, ctx: Arc, @@ -489,7 +509,10 @@ pub async fn reconcile_opa( &rolegroup, &merged_config, )?; - let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?; + let rg_service = + build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?; + let rg_metrics_service = + 
build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?; let rg_daemonset = build_server_rolegroup_daemonset( opa, &resolved_product_image, @@ -515,6 +538,12 @@ .with_context(|_| ApplyRoleGroupServiceSnafu { rolegroup: rolegroup.clone(), })?; + cluster_resources + .add(client, rg_metrics_service) + .await + .with_context(|_| ApplyRoleGroupMetricsServiceSnafu { + rolegroup: rolegroup.clone(), + })?; ds_cond_builder.add( cluster_resources .add(client, rg_daemonset.clone()) @@ -611,12 +640,7 @@ pub fn build_server_role_service( let service_spec = ServiceSpec { type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()), - ports: Some(vec![ServicePort { - name: Some(APP_PORT_NAME.to_string()), - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }]), + ports: Some(data_service_ports()), selector: Some(service_selector_labels.into()), internal_traffic_policy: Some("Local".to_string()), ..ServiceSpec::default() }; @@ -632,17 +656,14 @@ /// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup /// /// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing. -fn build_rolegroup_service( +fn build_rolegroup_headless_service( opa: &v1alpha1::OpaCluster, resolved_product_image: &ResolvedProductImage, rolegroup: &RoleGroupRef, ) -> Result { - let prometheus_label = - Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?; - let metadata = ObjectMetaBuilder::new() .name_and_namespace(opa) - .name(rolegroup.object_name()) + .name(rolegroup.rolegroup_headless_service_name()) .ownerreference_from_resource(opa, None, Some(true)) .context(ObjectMissingMetadataForOwnerRefSnafu)? 
.with_recommended_labels(build_recommended_labels( opa, &resolved_product_image.app_version_label, &rolegroup.role, &rolegroup.role_group, )) .context(ObjectMetaSnafu)? - .with_label(prometheus_label) .build(); - let service_selector_labels = - Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) - .context(BuildLabelSnafu)?; - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed + // Currently we don't offer listener-exposition of OPA mostly due to security concerns. + // OPA is currently public within the Kubernetes cluster (without authentication). + // Opening it up outside of Kubernetes might worsen things. + // We are open to implementing listener-integration, but this needs to be thought through before + // implementing it. + // Note: We have kind of similar situations for HMS and Zookeeper, as the authentication + // options there are non-existent (mTLS still opens plain port) or suck (Kerberos). type_: Some("ClusterIP".to_string()), cluster_ip: Some("None".to_string()), - ports: Some(service_ports()), - selector: Some(service_selector_labels.into()), + ports: Some(data_service_ports()), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), publish_not_ready_addresses: Some(true), ..ServiceSpec::default() }; @@ -676,6 +698,55 @@ }) } +/// The rolegroup metrics [`Service`] is a service that exposes metrics and has the +/// prometheus.io/scrape label. +fn build_rolegroup_metrics_service( + opa: &v1alpha1::OpaCluster, + resolved_product_image: &ResolvedProductImage, + rolegroup: &RoleGroupRef, +) -> Result { + let labels = Labels::try_from([("prometheus.io/scrape", "true")]) + .expect("static Prometheus labels must be valid"); + + let metadata = ObjectMetaBuilder::new() + .name_and_namespace(opa) + .name(rolegroup.rolegroup_metrics_service_name()) + .ownerreference_from_resource(opa, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? 
+ .with_recommended_labels(build_recommended_labels( + opa, + &resolved_product_image.app_version_label, + &rolegroup.role, + &rolegroup.role_group, + )) + .context(ObjectMetaSnafu)? + .with_labels(labels) + .build(); + + let service_spec = ServiceSpec { + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some(vec![metrics_service_port()]), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata, + spec: Some(service_spec), + status: None, + }) +} + +/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup. +fn role_group_selector_labels( + opa: &v1alpha1::OpaCluster, + rolegroup: &RoleGroupRef, +) -> Result { + Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) + .context(BuildLabelSnafu) +} + /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator fn build_server_rolegroup_config_map( opa: &v1alpha1::OpaCluster, @@ -904,6 +975,11 @@ fn build_server_rolegroup_daemonset( format!("{STACKABLE_LOG_DIR}/containerdebug"), ) .add_container_port(APP_PORT_NAME, APP_PORT.into()) + // If we also add a container port "metrics" pointing to the same port number, we get a + // + // .spec.template.spec.containers[name="opa"].ports: duplicate entries for key [containerPort=8081,protocol="TCP"] + // + // So we don't do that .add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR) .context(AddVolumeMountSnafu)? 
.add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR) @@ -1387,22 +1463,24 @@ fn build_prepare_start_command( prepare_container_args } -fn service_ports() -> Vec { - vec![ - ServicePort { - name: Some(APP_PORT_NAME.to_string()), - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }, - ServicePort { - name: Some(METRICS_PORT_NAME.to_string()), - port: 9504, // Arbitrary port number, this is never actually used anywhere - protocol: Some("TCP".to_string()), - target_port: Some(IntOrString::String(APP_PORT_NAME.to_string())), - ..ServicePort::default() - }, - ] +fn data_service_ports() -> Vec { + // Currently only HTTP is exposed + vec![ServicePort { + name: Some(APP_PORT_NAME.to_string()), + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }] +} + +fn metrics_service_port() -> ServicePort { + ServicePort { + name: Some(METRICS_PORT_NAME.to_string()), + // The metrics are served on the same port as the HTTP traffic + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + } } /// Creates recommended `ObjectLabels` to be used in deployed resources diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 010a0c29..2bcbf0f5 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -17,7 +17,7 @@ use stackable_operator::{ merge::Merge, }, k8s_openapi::apimachinery::pkg::api::resource::Quantity, - kube::CustomResource, + kube::{CustomResource, ResourceExt}, product_config_utils::Configuration, product_logging::{self, spec::Logging}, role_utils::{ @@ -326,7 +326,11 @@ impl v1alpha1::OpaCluster { /// The name of the role-level load-balanced Kubernetes `Service` pub fn server_role_service_name(&self) -> Option { - self.metadata.name.clone() + Some(format!( + "{cluster_name}-{role}", + cluster_name = self.name_any(), + role = v1alpha1::OpaRole::Server + )) } /// The fully-qualified domain name of 
the role-level load-balanced Kubernetes `Service` diff --git a/tests/templates/kuttl/aas-user-info/30-assert.yaml b/tests/templates/kuttl/aas-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/aas-user-info/30-assert.yaml +++ b/tests/templates/kuttl/aas-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/ad-user-info/30-assert.yaml b/tests/templates/kuttl/ad-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/ad-user-info/30-assert.yaml +++ b/tests/templates/kuttl/ad-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/keycloak-user-info/30-assert.yaml b/tests/templates/kuttl/keycloak-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/keycloak-user-info/30-assert.yaml +++ b/tests/templates/kuttl/keycloak-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/logging/test_log_aggregation.py 
b/tests/templates/kuttl/logging/test_log_aggregation.py index 3ed838f0..8ba52855 100755 --- a/tests/templates/kuttl/logging/test_log_aggregation.py +++ b/tests/templates/kuttl/logging/test_log_aggregation.py @@ -4,18 +4,16 @@ def send_opa_decision_request(): - response = requests.post( - 'http://test-opa:8081/v1/data/test/world' - ) + response = requests.post("http://test-opa-server:8081/v1/data/test/world") + + assert response.status_code == 200, "Cannot access the API of the opa cluster." - assert response.status_code == 200, \ - 'Cannot access the API of the opa cluster.' def check_sent_events(): response = requests.post( - 'http://opa-vector-aggregator:8686/graphql', + "http://opa-vector-aggregator:8686/graphql", json={ - 'query': """ + "query": """ { transforms(first:100) { nodes { @@ -29,31 +27,32 @@ def check_sent_events(): } } """ - } + }, ) - assert response.status_code == 200, \ - 'Cannot access the API of the vector aggregator.' + assert response.status_code == 200, ( + "Cannot access the API of the vector aggregator." + ) result = response.json() - transforms = result['data']['transforms']['nodes'] + transforms = result["data"]["transforms"]["nodes"] for transform in transforms: - sentEvents = transform['metrics']['sentEventsTotal'] - componentId = transform['componentId'] + sentEvents = transform["metrics"]["sentEventsTotal"] + componentId = transform["componentId"] - if componentId == 'filteredInvalidEvents': - assert sentEvents is None or \ - sentEvents['sentEventsTotal'] == 0, \ - 'Invalid log events were sent.' + if componentId == "filteredInvalidEvents": + assert sentEvents is None or sentEvents["sentEventsTotal"] == 0, ( + "Invalid log events were sent." + ) else: - assert sentEvents is not None and \ - sentEvents['sentEventsTotal'] > 0, \ - f'No events were sent in "{componentId}".' + assert sentEvents is not None and sentEvents["sentEventsTotal"] > 0, ( + f'No events were sent in "{componentId}".' 
+ ) -if __name__ == '__main__': +if __name__ == "__main__": send_opa_decision_request() time.sleep(10) check_sent_events() - print('Test successful!') + print("Test successful!") diff --git a/tests/templates/kuttl/smoke/20-assert.yaml b/tests/templates/kuttl/smoke/20-assert.yaml index 6041b967..579dfd86 100644 --- a/tests/templates/kuttl/smoke/20-assert.yaml +++ b/tests/templates/kuttl/smoke/20-assert.yaml @@ -6,7 +6,7 @@ timeout: 300 apiVersion: apps/v1 kind: StatefulSet metadata: - name: test-regorule + name: test-opa status: readyReplicas: 1 replicas: 1 diff --git a/tests/templates/kuttl/smoke/20-install-test-regorule.yaml b/tests/templates/kuttl/smoke/20-install-test-opa.yaml similarity index 78% rename from tests/templates/kuttl/smoke/20-install-test-regorule.yaml rename to tests/templates/kuttl/smoke/20-install-test-opa.yaml index 816f6148..05d33126 100644 --- a/tests/templates/kuttl/smoke/20-install-test-regorule.yaml +++ b/tests/templates/kuttl/smoke/20-install-test-opa.yaml @@ -2,21 +2,21 @@ apiVersion: apps/v1 kind: StatefulSet metadata: - name: test-regorule + name: test-opa labels: - app: test-regorule + app: test-opa spec: replicas: 1 selector: matchLabels: - app: test-regorule + app: test-opa template: metadata: labels: - app: test-regorule + app: test-opa spec: containers: - - name: test-regorule + - name: test-opa image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev stdin: true tty: true diff --git a/tests/templates/kuttl/smoke/30-assert.yaml b/tests/templates/kuttl/smoke/30-assert.yaml index 0d0ef28f..9c83596e 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml +++ b/tests/templates/kuttl/smoke/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-regorule.py -u 
'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml b/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml new file mode 100644 index 00000000..84162ae6 --- /dev/null +++ b/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml @@ -0,0 +1,5 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl cp -n $NAMESPACE ./30_test-regorule.py test-opa-0:/tmp + - script: kubectl cp -n $NAMESPACE ./30_test-metrics.py test-opa-0:/tmp diff --git a/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml b/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml deleted file mode 100644 index 1623156a..00000000 --- a/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kuttl.dev/v1beta1 -kind: TestStep -commands: - - script: kubectl cp -n $NAMESPACE ./test-regorule.py test-regorule-0:/tmp diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py new file mode 100644 index 00000000..54625f03 --- /dev/null +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +import requests + +metrics_url = "http://test-opa-server-default-metrics:8081/metrics" +response = requests.get(metrics_url) + +assert response.status_code == 200, "Metrics endpoint must return a 200 status code" +assert "bundle_loaded_counter" in response.text, ( + f"Metric bundle_loaded_counter should exist in {metrics_url}" +) +print("Metrics test successful!") diff --git a/tests/templates/kuttl/smoke/test-regorule.py b/tests/templates/kuttl/smoke/30_test-regorule.py similarity index 96% rename from tests/templates/kuttl/smoke/test-regorule.py rename to tests/templates/kuttl/smoke/30_test-regorule.py index 3e2a78b6..d3caedc2 100755 --- a/tests/templates/kuttl/smoke/test-regorule.py +++ b/tests/templates/kuttl/smoke/30_test-regorule.py @@ -34,7 +34,7 @@ and "hello" in response["result"] and 
response["result"]["hello"] ): - print("Test successful!") + print("Regorule test successful!") exit(0) else: print( diff --git a/tests/templates/kuttl/smoke/31-assert.yaml b/tests/templates/kuttl/smoke/31-assert.yaml new file mode 100644 index 00000000..0084e371 --- /dev/null +++ b/tests/templates/kuttl/smoke/31-assert.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: test-metrics +commands: + - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-metrics.py