Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a67d2fc
STAC-23751: Document procedure to lower retention on SG and recover d…
aacevedoosorio Nov 14, 2025
40a132f
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
4c5861c
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
cb83d45
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
cc7d035
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
c87ad41
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
5907594
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
7e6484a
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
4c5b8b5
Merge pull request #119 from aacevedoosorio/stac-23751
akashraj4261 Nov 18, 2025
bd10747
STAC-23748: Add workload observer to logs collector
craffit Nov 19, 2025
893f924
Merge pull request #124 from rancher/stac-23748-workload-logs
akashraj4261 Nov 20, 2025
d859123
STAC-23748: Add workload observer to ack persistent volumes
craffit Nov 20, 2025
756d2ac
STAC-23748: Also fix tephra
craffit Nov 20, 2025
b751b4e
STAC-23748: Typo
craffit Nov 20, 2025
4c3c55f
Merge pull request #125 from rancher/stac-23748-workload-logs
akashraj4261 Nov 21, 2025
93340ce
Mergin staging and main to create release notes (#131)
dmbarrasuse Nov 25, 2025
5990aac
STAC-23862: Add release notes 2.6.3
dmbarrasuse Nov 25, 2025
b091ca6
Merge branch 'main' into STAC-23862-2
dmbarrasuse Nov 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions docs/latest/modules/en/attachments/suse-observability_logs_collector.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash

ELASTICSEARCH_LOGS=false
ELASTICSEARCH_LOGS=false
ELASTICSEARCH_RANGE="7d"
while getopts "her:" option; do
Expand All @@ -24,6 +25,7 @@ EOF
exit 0;;
e) # Collect elasticsearch logs
ELASTICSEARCH_LOGS=true;;
ELASTICSEARCH_LOGS=true;;
r) # Time range for elasticsearch logs
ELASTICSEARCH_RANGE=$OPTARG;;
\?) # Invalid option
Expand All @@ -50,19 +52,25 @@ done

# skip helm release analysis when not all its dependencies are present
HELM_RELEASES=true
HELM_RELEASES=true
for cmd in base64 gzip jq
do
if ! command -v $cmd &>/dev/null; then
echo "$cmd is not installed. Skipping analysis of helm releases."
HELM_RELEASES=false
HELM_RELEASES=false
fi
done

# Check if KUBECONFIG is set
if [[ -z "$KUBECONFIG" || ! -f "$KUBECONFIG" ]]; then
echo "Error: KUBECONFIG is not set. Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script."
echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..."
exit 1
if ! kubectl config current-context > /dev/null; then
echo "Error: Could not find kubernetes cluster to connect to."
echo "Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script."
echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..."
exit 1
else
CONTEXT=$(kubectl config current-context)
echo "Retrieving logs from kubernetes context: $CONTEXT"
fi

# Check if namespace exist or not
Expand All @@ -71,7 +79,7 @@ if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
exit 1
fi
# Directory to store logs
OUTPUT_DIR="${NAMESPACE}_logs_$(date +%Y%m%d%H%M%S)"
OUTPUT_DIR="${NAMESPACE}_logs_$(date -u +%Y-%m-%d_%H-%M-%SZ)"
ARCHIVE_FILE="${OUTPUT_DIR}.tar.gz"

techo() {
Expand Down Expand Up @@ -131,6 +139,20 @@ collect_helm_releases() {
techo "Collecting helm releases..."
mkdir -p "$OUTPUT_DIR/releases"

# Restrict keys extracted from Helm values to only this include-list to avoid including any
included_keys='["resources", "affinity", "nodeSelector", "tolerations"]'

# 1. --argjson keys "$included_keys": Passes the shell variable as a JSON array $keys.
# 2. . as $input: Saves the entire original JSON into a variable $input.
# 3. [ paths | ... ]: Gathers all paths from the JSON.
# 4. select(.[-1] as $last | $keys | index($last)): Selects only paths where
# the last element (.[-1]) is found inside the $keys array.
# 5. reduce .[] as $p (null; ...): Starts with an empty (null) document
# and iterates over every path ($p) that was selected.
# 6. setpath($p; $input | getpath($p)): For each path, it sets that path
# in the *new* document, pulling the *value* from the original $input.


# Restrict keys extracted from Helm values to only this include-list to avoid including any
included_keys='["resources", "affinity", "nodeSelector", "tolerations"]'

Expand All @@ -149,6 +171,7 @@ collect_helm_releases() {
kubectl -n "$NAMESPACE" get secret "$release" -o jsonpath='{.data.release}' | \
base64 --decode | base64 --decode | gzip -d | \
jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release"
jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release"
done
}

Expand Down Expand Up @@ -290,6 +313,18 @@ collect_hbase_report() {
fi
}

#######################################
# Collect the /report-data directory from the workload-observer pod, if one
# is deployed in $NAMESPACE, into $OUTPUT_DIR/workload-observer-data.
# Globals:   NAMESPACE (read), OUTPUT_DIR (read)
# Outputs:   status messages via techo; files under workload-observer-data/
# Returns:   0 always (missing pod is not an error, collection is best-effort)
#######################################
collect_workload_observer_data() {
  techo "Collecting workload observer data..."
  # First pod labelled as the workload-observer component; empty string when
  # the component is not deployed (stderr suppressed for the no-pods case).
  POD=$(kubectl -n "$NAMESPACE" get pod -l app.kubernetes.io/component=workload-observer -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
  if [[ -z "$POD" ]]; then
    techo "INFO: No workload observer pod found, skipping"
    return
  fi

  mkdir -p "$OUTPUT_DIR/workload-observer-data"
  # Copy report data in the background so the remaining collectors can run
  # concurrently; output is discarded as this is a best-effort collection.
  # NOTE(review): no 'wait' is visible before the archive step — confirm the
  # copy completes before $OUTPUT_DIR is tarred, or data may be truncated.
  kubectl -n "$NAMESPACE" cp "$POD:/report-data" "$OUTPUT_DIR/workload-observer-data/" > /dev/null 2>&1 &
}

archive_and_cleanup() {
echo "Creating archive $ARCHIVE_FILE..."
tar -czf "$ARCHIVE_FILE" "$OUTPUT_DIR"
Expand Down Expand Up @@ -348,10 +383,14 @@ collect_pod_logs
collect_pod_disk_usage
collect_hdfs_report
collect_hbase_report
collect_hdfs_report
collect_hbase_report
collect_yaml_configs
collect_workload_observer_data
if $HELM_RELEASES; then
collect_helm_releases
fi
if $ELASTICSEARCH_LOGS; then
if $ELASTICSEARCH_LOGS; then
collect_pod_logs_from_elasticsearch
fi
Expand Down
1 change: 1 addition & 0 deletions docs/latest/modules/en/nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
*** xref:setup/release-notes/v2.6.0.adoc[v2.6.0 - 29/Sep/2025]
*** xref:setup/release-notes/v2.6.1.adoc[v2.6.1 - 13/Oct/2025]
*** xref:setup/release-notes/v2.6.2.adoc[v2.6.2 - 03/Nov/2025]
*** xref:setup/release-notes/v2.6.3.adoc[v2.6.3 - 25/Nov/2025]
** xref:setup/upgrade-stackstate/README.adoc[Upgrade SUSE Observability]
*** xref:setup/upgrade-stackstate/migrate-from-6.adoc[Migration from StackState]
*** xref:setup/upgrade-stackstate/steps-to-upgrade.adoc[Steps to upgrade]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,54 @@ Note that by adding more time to the data retention period, the amount of data s

When lowering the retention period, it can take some time until disk space is freed up (at least 15 minutes).

=== Troubleshooting topology disk space issues
If you run into disk space issues, a log line containing `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1` appears in the namenode logs. Follow the steps below to recover:

* Lower the retention, prepare the instance to recover disk space immediately, and trigger a helm upgrade:
[,yaml]
----
stackstate:
topology:
# Retention set to 1 week in case you are running with the default 1 month
retentionHours: 144
hbase:
console:
enabled: true
replicaCount: 1
hdfs:
datanode:
extraEnv:
open:
HDFS_CONF_dfs_datanode_du_reserved_pct: "0"
----

[NOTE]
====
Wait until all the hbase and hdfs pods are stable before moving on to the next step.
====

* Trigger the compaction of historic data:
[,bash]
----
kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)"
----

* Follow the progress using:
----
kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)"
----

* If the budgeted disk space is insufficient, contact SUSE support through the support portal.

* Restore the settings. Once the status is no longer in progress - `Status(inProgress = false, lastFailure = null)` - trigger a helm upgrade to preserve the new retention as part of your values.
[,yaml]
----
stackstate:
topology:
# Retention set to 1 week in case you are running with the default 1 month
retentionHours: 144
----

== Retention of events and logs

=== SUSE Observability data store
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ We provide a dedicated set of Helm values that adjusts all volume sizes to meet
zookeeper:
persistence:
size: 20Gi
hbase:
tephra:
persistence:
size: 20Gi
stackstate:
components:
checks:
Expand All @@ -34,6 +38,9 @@ stackstate:
vmagent:
persistence:
size: 20Gi
workloadObserver:
persistence:
size: 20Gi
features:
storeTransactionLogsToPVC:
volumeSize: 20Gi
Expand Down
32 changes: 32 additions & 0 deletions docs/latest/modules/en/pages/setup/release-notes/v2.6.3.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
= v2.6.3 - 25/Nov/2025
:revdate: 2025-11-25
:page-revdate: {revdate}
:description: SUSE Observability Self-hosted

== Release Notes: {stackstate-product-name} Helm Chart v2.6.3

== New Features & Enhancements

* *HDFS Upgrade:* HDFS (Hadoop Distributed File System) and its associated dependencies have been upgraded.
* *StackPack: Partial Topology Sync Monitor:* A new monitor has been added to the StackState StackPack to alert on **partial Topology Synchronization snapshots**.
* *vmagent Resource Increase:* The memory and CPU resource requirements for the `vmagent` component have been increased in the `4000-ha` profile.
* *Image Upgrades:*
** The **Kafka** container image has been upgraded.
** The **ClickHouse** container image has been upgraded.

== Bug Fixes

* *OpenTelemetry Metric Scoping:* Fixed a critical issue where metrics ingested via the OpenTelemetry collector were missing the `_scope_` label. This prevented **scoped users** from being able to observe these metrics.
* *Metric Explorer Sorting:* The **Metric Explorer** now uses numerical sorting for values in the value column.
* *Platform: StackGraph Corruption (Timed-Out Transactions):* Fixed a **StackGraph corruption issue** where data from timed-out transactions that should have been rolled back could inadvertently reappear.
* *Platform: State Pod Validation:* Added **additional data validation and logging** to the state pod for improved stability and debugging.
* *StackGraph: Edge Deletion Invariant:* Added an invariant to prevent inconsistent edge references when performing a delete edge operation in **StackGraph**.
* *StackGraph Integrity Verifier:* An **experimental perpetual integrity verifier** has been added for StackGraph. It can be enabled by setting `hbase.console.integrity.enabled=true`.
* *StackPack Remediation Guides:* Fixed several remediation guides within the SUSE Observability stackpack that incorrectly referenced `tags` instead of the correct term, **`labels`**.
* *Duplicate OpenTelemetry StackPack:* Removed a duplicate **OpenTelemetry stackpack** installation.
* *Platform: Agent Restart Snapshot Loop:* Fixed an issue where a restart of an agent could cause the **'active snapshot'** to continuously occur.
* *Platform: Kafka JMX OOM Fix:* Resolved an Out-Of-Memory (OOM) issue for the Kafka JMX container on RKE2 Kubernetes versions 1.31 and 1.30.

=== Agent Bug Fixes

* *Agent: /proc/<pid>/stat Panic:* The agent now includes a fix to prevent a panic when a `/proc/<pid>/stat` file is found to be empty.
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,28 @@ For Rancher RBAC to function,
* the {stackstate-product-name} Agent must have the RBAC Agent enabled and must authenticate using a service token.
====

Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and lets a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. To see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*:
Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and lets a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. In order to see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*:

Instance Roles:: Enables you to configure or personalize {stackstate-product-name}.
Instance Roles:: Enables you to configure or personalize {stackstate-product-name}.
Scoped Roles:: Grants access to {stackstate-product-name} data from observed clusters.

== Instance roles

You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. If no instance roles are explicitly assigned to a member of a project, then the permissions of the *Instance Basic Access* role is applied.
You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. If no instance roles are explicitly assigned to a member of a project, then they will have the permissions of the *Instance Basic Access* role.

=== Instance roles with access to {stackstate-product-name} data

A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles].
A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles].

Instance Admin:: Grants full access to all views and all permissions.
Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI.
Instance Observer:: Grants access to all data in a SUSE Observability instance.
Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI.
Instance Observer:: Grants access to all data in a SUSE Observability instance.

=== Instance roles without access to {stackstate-product-name} data

These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). Otherwise, no {stackstate-product-name} data is accessible and a "No components found" message appears in the UI. This applies to all Rancher users, including users, such as Project owners.
These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). Otherwise, no {stackstate-product-name} data is accessible and the UI will show a "No components found" message. This applies to all Rancher users, including privileged users such as Project owners.

Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalization {stackstate-product-name}.
Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalization {stackstate-product-name}.
Instance Basic Access:: Grants minimal permissions to use {stackstate-product-name}. This role does not need to be explicitly assigned and there is no *Role Template* for it; every logged-in user has it.

You can find the permissions assigned to each predefined SUSE Observability role below. For details of the different permissions and how to manage them using the `sts` CLI, see xref:/setup/security/rbac/rbac_permissions.adoc[Role based access control (RBAC) permissions]
Expand All @@ -59,12 +59,30 @@ These permissions are granted to all users.
|views |get
|===

--
Basic Access::
+
--
Basic access grants minimal permissions for using SUSE Observability. To be combined with an Observer (Instance, Cluster or Project).
These permissions are granted to all users.

|===
|Resource |Verbs

|metric-bindings |get
|settings |get
|system-notifications |get
|views |get
|===

--
Recommended Access::
+
--
Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. It provides a limited degree of personalization.
To be combined with an Observer (Instance, Cluster or Project).
Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. It provides a limited degree of personalization.
To be combined with an Observer (Instance, Cluster or Project).

|===
|Resource |Verbs
Expand All @@ -80,7 +98,7 @@ To be combined with an Observer (Instance, Cluster or Project).
Observer::
+
--
Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience.
Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience.

|===
|Resource |Verbs
Expand Down Expand Up @@ -121,6 +139,7 @@ The Troubleshooter role has access to all data available in SUSE Observability a

--
Admin::
Admin::
+
--
The Administrator role has all permissions assigned.
Expand Down Expand Up @@ -160,7 +179,7 @@ The Administrator role has all permissions assigned.
[#scoped]
== Scoped roles

You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data.
You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data.

Observer:: Grants access to data coming from namespaces in a *Project*. You can use this in the *Project Membership* section of the cluster configuration.
Cluster Observer:: Grants access to all data coming from a *Cluster*. You can use this template in the *Cluster Membership* section of the cluster configuration.
Expand All @@ -171,21 +190,24 @@ The resources in these roles correspond to xref:/setup/security/rbac/rbac_permis
* `traces` - spans from the cluster or namespace
* `metrics` - metric data originating from the cluster or namespace

Note that access to logs is controlled by the `topology` resource.
Note that access to logs is controlled by the `topology` resource.

Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}.
Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}.

== Custom roles
== Custom roles

To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with:
To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with:

* Verb: `get`
* Resource: `metricbindings` and `monitors`
* ApiGroup: `instance.observability.cattle.io`

image::rancher-custom-role.png[Custom RoleTemplate for richer access]

You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`.
You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`.


== Troubleshooting
Expand Down