diff --git a/docs/latest/modules/en/attachments/suse-observability_logs_collector.sh b/docs/latest/modules/en/attachments/suse-observability_logs_collector.sh old mode 100644 new mode 100755 index c52af5b3..aba00852 --- a/docs/latest/modules/en/attachments/suse-observability_logs_collector.sh +++ b/docs/latest/modules/en/attachments/suse-observability_logs_collector.sh @@ -1,5 +1,6 @@ #!/bin/bash +ELASTICSEARCH_LOGS=false ELASTICSEARCH_LOGS=false ELASTICSEARCH_RANGE="7d" while getopts "her:" option; do @@ -24,6 +25,7 @@ EOF exit 0;; e) # Collect elasticsearch logs ELASTICSEARCH_LOGS=true;; + ELASTICSEARCH_LOGS=true;; r) # Time range for elasticsearch logs ELASTICSEARCH_RANGE=$OPTARG;; \?) # Invalid option @@ -50,19 +52,25 @@ done # skip helm release analysis when not all its dependencies are present HELM_RELEASES=true +HELM_RELEASES=true for cmd in base64 gzip jq do if ! command -v $cmd &>/dev/null; then echo "$cmd is not installed. Skipping analysis of helm releases." HELM_RELEASES=false + HELM_RELEASES=false fi done # Check if KUBECONFIG is set -if [[ -z "$KUBECONFIG" || ! -f "$KUBECONFIG" ]]; then - echo "Error: KUBECONFIG is not set. Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script." - echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..." - exit 1 +if ! kubectl config current-context > /dev/null; then + echo "Error: Could not find kubernetes cluster to connect to." + echo "Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script." + echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..." + exit 1 +else + CONTEXT=$(kubectl config current-context) + echo "Retrieving logs from kubernetes context: $CONTEXT" fi # Check if namespace exist or not @@ -71,7 +79,7 @@ if ! 
kubectl get namespace "$NAMESPACE" &>/dev/null; then exit 1 fi # Directory to store logs -OUTPUT_DIR="${NAMESPACE}_logs_$(date +%Y%m%d%H%M%S)" +OUTPUT_DIR="${NAMESPACE}_logs_$(date -u +%Y-%m-%d_%H-%M-%SZ)" ARCHIVE_FILE="${OUTPUT_DIR}.tar.gz" techo() { @@ -131,6 +139,20 @@ collect_helm_releases() { techo "Collecting helm releases..." mkdir -p "$OUTPUT_DIR/releases" + # Restrict keys extracted from Helm values to only this include-list to avoid including any + included_keys='["resources", "affinity", "nodeSelector", "tolerations"]' + + # 1. --argjson keys "$included_keys": Passes the shell variable as a JSON array $keys. + # 2. . as $input: Saves the entire original JSON into a variable $input. + # 3. [ paths | ... ]: Gathers all paths from the JSON. + # 4. select(.[-1] as $last | $keys | index($last)): Selects only paths where + # the last element (.[-1]) is found inside the $keys array. + # 5. reduce .[] as $p (null; ...): Starts with an empty (null) document + # and iterates over every path ($p) that was selected. + # 6. setpath($p; $input | getpath($p)): For each path, it sets that path + # in the *new* document, pulling the *value* from the original $input. 
+ + # Restrict keys extracted from Helm values to only this include-list to avoid including any included_keys='["resources", "affinity", "nodeSelector", "tolerations"]' @@ -149,6 +171,7 @@ collect_helm_releases() { kubectl -n "$NAMESPACE" get secret "$release" -o jsonpath='{.data.release}' | \ base64 --decode | base64 --decode | gzip -d | \ jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release" + jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release" done } @@ -290,6 +313,18 @@ collect_hbase_report() { fi } +collect_workload_observer_data() { + techo "Collecting workload observer data..." + POD=$(kubectl -n "$NAMESPACE" get pod -l app.kubernetes.io/component=workload-observer -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [ "$POD" == "" ]; then + techo "INFO: No workload observer pod found, skipping" + return + fi + + mkdir -p "$OUTPUT_DIR/workload-observer-data" + kubectl -n "$NAMESPACE" cp "$POD:/report-data" "$OUTPUT_DIR/workload-observer-data/" > /dev/null 2>&1 & +} + archive_and_cleanup() { echo "Creating archive $ARCHIVE_FILE..." 
tar -czf "$ARCHIVE_FILE" "$OUTPUT_DIR" @@ -348,10 +383,14 @@ collect_pod_logs collect_pod_disk_usage collect_hdfs_report collect_hbase_report +collect_hdfs_report +collect_hbase_report collect_yaml_configs +collect_workload_observer_data if $HELM_RELEASES; then collect_helm_releases fi +if $ELASTICSEARCH_LOGS; then if $ELASTICSEARCH_LOGS; then collect_pod_logs_from_elasticsearch fi diff --git a/docs/latest/modules/en/nav.adoc b/docs/latest/modules/en/nav.adoc index c31732e1..2e8aa596 100644 --- a/docs/latest/modules/en/nav.adoc +++ b/docs/latest/modules/en/nav.adoc @@ -139,6 +139,7 @@ *** xref:setup/release-notes/v2.6.0.adoc[v2.6.0 - 29/Sep/2025] *** xref:setup/release-notes/v2.6.1.adoc[v2.6.1 - 13/Oct/2025] *** xref:setup/release-notes/v2.6.2.adoc[v2.6.2 - 03/Nov/2025] +*** xref:setup/release-notes/v2.6.3.adoc[v2.6.3 - 25/Nov/2025] ** xref:setup/upgrade-stackstate/README.adoc[Upgrade SUSE Observability] *** xref:setup/upgrade-stackstate/migrate-from-6.adoc[Migration from StackState] *** xref:setup/upgrade-stackstate/steps-to-upgrade.adoc[Steps to upgrade] diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index 2f8d1d95..dc02eb18 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -24,6 +24,54 @@ Note that by adding more time to the data retention period, the amount of data s When lowering the retention period, it can take some time until disk space is freed up (at least 15 minutes). +=== Troubleshooting topology disk space issues +In case of running into disk space issues, a log line - `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1` appears in the namenode. 
Follow the steps below to deal with this scenario: + +* Lower the retention, prepare the instance to recover disk space immediately, and trigger a helm upgrade: +[,yaml] +---- +stackstate: + topology: + # Retention set to 1 week in case you are running with the default 1 month + retentionHours: 144 +hbase: + console: + enabled: true + replicaCount: 1 + hdfs: + datanode: + extraEnv: + open: + HDFS_CONF_dfs_datanode_du_reserved_pct: "0" +---- + +[NOTE] +==== +Wait until all the hbase and hdfs pods are stable before moving on to the next step. +==== + +* Trigger the compaction of historic data: +[,bash] +---- +kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)" +---- + +* Follow the progress using: +---- +kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)" +---- + +* In case the budgeted disk space is insufficient, contact SUSE support. + +* Restore the settings. Once the status is no longer in progress - `Status(inProgress = false, lastFailure = null)`, trigger a helm upgrade that preserves the new retention as part of your values. 
+[,yaml] +---- +stackstate: + topology: + # Retention set to 1 week in case you are running with the default 1 month + retentionHours: 144 +---- + == Retention of events and logs === SUSE Observability data store diff --git a/docs/latest/modules/en/pages/setup/install-stackstate/kubernetes_openshift/ack.adoc b/docs/latest/modules/en/pages/setup/install-stackstate/kubernetes_openshift/ack.adoc index 3fe6f0fd..51cd5d99 100644 --- a/docs/latest/modules/en/pages/setup/install-stackstate/kubernetes_openshift/ack.adoc +++ b/docs/latest/modules/en/pages/setup/install-stackstate/kubernetes_openshift/ack.adoc @@ -15,6 +15,10 @@ We provide a dedicated set of Helm values that adjusts all volume sizes to meet zookeeper: persistence: size: 20Gi +hbase: + tephra: + persistence: + size: 20Gi stackstate: components: checks: @@ -34,6 +38,9 @@ stackstate: vmagent: persistence: size: 20Gi + workloadObserver: + persistence: + size: 20Gi features: storeTransactionLogsToPVC: volumeSize: 20Gi diff --git a/docs/latest/modules/en/pages/setup/release-notes/v2.6.3.adoc b/docs/latest/modules/en/pages/setup/release-notes/v2.6.3.adoc new file mode 100644 index 00000000..d39320fe --- /dev/null +++ b/docs/latest/modules/en/pages/setup/release-notes/v2.6.3.adoc @@ -0,0 +1,32 @@ += v2.6.3 - 25/Nov/2025 +:revdate: 2025-11-25 +:page-revdate: {revdate} +:description: SUSE Observability Self-hosted + +== Release Notes: {stackstate-product-name} Helm Chart v2.6.3 + +== New Features & Enhancements + +* *HDFS Upgrade:* HDFS (Hadoop Distributed File System) and its associated dependencies have been upgraded. +* *StackPack: Partial Topology Sync Monitor:* A new monitor has been added to the StackState StackPack to alert on **partial Topology Synchronization snapshots**. +* *vmagent Resource Increase:* The memory and CPU resource requirements for the `vmagent` component have been increased in the `4000-ha` profile. +* *Image Upgrades:* +** The **Kafka** container image has been upgraded. 
+** The **ClickHouse** container image has been upgraded. + +== Bug Fixes + +* *OpenTelemetry Metric Scoping:* Fixed a critical issue where metrics ingested via the OpenTelemetry collector were missing the `_scope_` label. This prevented **scoped users** from being able to observe these metrics. +* *Metric Explorer Sorting:* The **Metric Explorer** now uses numerical sorting for values in the value column. +* *Platform: StackGraph Corruption (Timed-Out Transactions):* Fixed a **StackGraph corruption issue** where data from timed-out transactions that should have been rolled back could inadvertently reappear. +* *Platform: State Pod Validation:* Added **additional data validation and logging** to the state pod for improved stability and debugging. +* *StackGraph: Edge Deletion Invariant:* Added an invariant to prevent inconsistent edge references when performing a delete edge operation in **StackGraph**. +* *StackGraph Integrity Verifier:* An **experimental perpetual integrity verifier** has been added for StackGraph. It can be enabled by setting `hbase.console.integrity.enabled=true`. +* *StackPack Remediation Guides:* Fixed several remediation guides within the SUSE Observability stackpack that incorrectly referenced `tags` instead of the correct term, **`labels`**. +* *Duplicate OpenTelemetry StackPack:* Removed a duplicate **OpenTelemetry stackpack** installation. +* *Platform: Agent Restart Snapshot Loop:* Fixed an issue where a restart of an agent could cause the **'active snapshot'** to continuously occur. +* *Platform: Kafka JMX OOM Fix:* Resolved an Out-Of-Memory (OOM) issue for the Kafka JMX container on RKE2 Kubernetes versions 1.31 and 1.30. 
+ +=== Agent Bug Fixes + +* *Agent: /proc/<pid>/stat Panic:* The agent now includes a fix to prevent a panic when a `/proc/<pid>/stat` file is found to be empty. \ No newline at end of file diff --git a/docs/latest/modules/en/pages/setup/security/rbac/rbac_rancher.adoc b/docs/latest/modules/en/pages/setup/security/rbac/rbac_rancher.adoc index 92030715..93370612 100644 --- a/docs/latest/modules/en/pages/setup/security/rbac/rbac_rancher.adoc +++ b/docs/latest/modules/en/pages/setup/security/rbac/rbac_rancher.adoc @@ -16,28 +16,28 @@ For Rancher RBAC to function, * the {stackstate-product-name} Agent must have the RBAC Agent enabled and must authenticate using a service token. ==== -Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and lets a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. To see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*: +Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and let a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. In order to see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*: -Instance Roles:: Enables you to configure or personalize {stackstate-product-name}. +Instance Roles:: Enables you to configure or personalize {stackstate-product-name}. Scoped Roles:: Grants access to {stackstate-product-name} data from observed clusters. == Instance roles -You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. 
If no instance roles are explicitly assigned to a member of a project, then the permissions of the *Instance Basic Access* role is applied. +You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. If no instance roles are explicitly assigned to a member of a project, then they will have the permissions of the *Instance Basic Access* role. === Instance roles with access to {stackstate-product-name} data -A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles]. +A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles]. Instance Admin:: Grants full access to all views and all permissions. -Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI. -Instance Observer:: Grants access to all data in a SUSE Observability instance. +Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI. +Instance Observer:: Grants access to all data in a SUSE Observability instance. === Instance roles without access to {stackstate-product-name} data -These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). 
Otherwise, no {stackstate-product-name} data is accessible and a "No components found" message appears in the UI. This applies to all Rancher users, including users, such as Project owners. +These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). Otherwise, no {stackstate-product-name} data is accessible and the UI will show a "No components found" message. This applies to all Rancher users, including Project owners. -Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalization {stackstate-product-name}. +Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalizing {stackstate-product-name}. Instance Basic Access:: Grants minimal permissions to use {stackstate-product-name}. This role does not need to be explicitly assigned and there is no *Role Template* for it; every logged-in user has it. You can find the permissions assigned to each predefined SUSE Observability role below. For details of the different permissions and how to manage them using the `sts` CLI, see xref:/setup/security/rbac/rbac_permissions.adoc[Role based access control (RBAC) permissions] @@ -59,12 +59,30 @@ These permissions are granted to all users. |views |get |=== +-- +Basic Access:: ++ +-- +Basic access grants minimal permissions for using SUSE Observability. To be combined with an Observer (Instance, Cluster or Project). +These permissions are granted to all users. + +|=== +|Resource |Verbs + +|metric-bindings |get +|settings |get +|system-notifications |get +|views |get +|=== + -- Recommended Access:: + -- Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. 
It provides a limited degree of personalization. To be combined with an Observer (Instance, Cluster or Project). +Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. It provides a limited degree of personalization. +To be combined with an Observer (Instance, Cluster or Project). |=== |Resource |Verbs @@ -80,7 +98,7 @@ To be combined with an Observer (Instance, Cluster or Project). Observer:: + -- -Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience. +Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience. |=== |Resource |Verbs @@ -121,6 +139,7 @@ The Troubleshooter role has access to all data available in SUSE Observability a -- Admin:: +Admin:: + -- The Administrator role has all permissions assigned. @@ -160,7 +179,7 @@ The Administrator role has all permissions assigned. [#scoped] == Scoped roles -You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data. +You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data. Observer:: Grants access to data coming from namespaces in a *Project*. You can use this in the *Project Membership* section of the cluster configuration. Cluster Observer:: Grants access to all data coming from a *Cluster*. You can use this template in the *Cluster Membership* section of the cluster configuration. 
@@ -171,13 +190,16 @@ The resources in these roles correspond to xref:/setup/security/rbac/rbac_permis * `traces` - spans from the cluster or namespace * `metrics` - metric data originating from the cluster or namespace +Note that access to logs is controlled by the `topology` resource. Note that access to logs is controlled by the `topology` resource. +Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}. Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}. +== Custom roles == Custom roles -To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with: +To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with: * Verb: `get` * Resource: `metricbindings` and `monitors` @@ -185,7 +207,7 @@ To grant additional permissions beyond Recommended Access, create a custom Proje image::rancher-custom-role.png[Custom RoleTemplate for richer access] -You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`. +You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. 
Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`. == Troubleshooting diff --git a/product-docs-common b/product-docs-common index 0231cea5..c31fda22 160000 --- a/product-docs-common +++ b/product-docs-common @@ -1 +1 @@ -Subproject commit 0231cea59bb90f25c61412cffba96d8333a3f034 +Subproject commit c31fda22fd075ddf1eb97b92045d05db8f8a38a7