Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a67d2fc
STAC-23751: Document procedure to lower retention on SG and recover d…
aacevedoosorio Nov 14, 2025
40a132f
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
4c5861c
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
cb83d45
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
cc7d035
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
c87ad41
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
5907594
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
7e6484a
Update docs/latest/modules/en/pages/setup/data-management/data_retent…
aacevedoosorio Nov 17, 2025
4c5b8b5
Merge pull request #119 from aacevedoosorio/stac-23751
akashraj4261 Nov 18, 2025
bd10747
STAC-23748: Add workload observer to logs collector
craffit Nov 19, 2025
893f924
Merge pull request #124 from rancher/stac-23748-workload-logs
akashraj4261 Nov 20, 2025
d859123
STAC-23748: Add workload observer to ack persistent volumes
craffit Nov 20, 2025
756d2ac
STAC-23748: Also fix tephra
craffit Nov 20, 2025
b751b4e
STAC-23748: Typo
craffit Nov 20, 2025
4c3c55f
Merge pull request #125 from rancher/stac-23748-workload-logs
akashraj4261 Nov 21, 2025
93340ce
Mergin staging and main to create release notes (#131)
dmbarrasuse Nov 25, 2025
5990aac
STAC-23862: Add release notes 2.6.3
dmbarrasuse Nov 25, 2025
b091ca6
Merge branch 'main' into STAC-23862-2
dmbarrasuse Nov 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions docs/latest/modules/en/attachments/suse-observability_logs_collector.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash

ELASTICSEARCH_LOGS=false
ELASTICSEARCH_LOGS=false
ELASTICSEARCH_RANGE="7d"
while getopts "her:" option; do
Expand All @@ -24,6 +25,7 @@ EOF
exit 0;;
e) # Collect elasticsearch logs
ELASTICSEARCH_LOGS=true;;
ELASTICSEARCH_LOGS=true;;
r) # Time range for elasticsearch logs
ELASTICSEARCH_RANGE=$OPTARG;;
\?) # Invalid option
Expand All @@ -50,19 +52,25 @@ done

# skip helm release analysis when not all its dependencies are present
HELM_RELEASES=true
HELM_RELEASES=true
for cmd in base64 gzip jq
do
if ! command -v $cmd &>/dev/null; then
echo "$cmd is not installed. Skipping analysis of helm releases."
HELM_RELEASES=false
HELM_RELEASES=false
fi
done

# Check if KUBECONFIG is set
if [[ -z "$KUBECONFIG" || ! -f "$KUBECONFIG" ]]; then
echo "Error: KUBECONFIG is not set. Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script."
echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..."
exit 1
if ! kubectl config current-context > /dev/null; then
echo "Error: Could not find kubernetes cluster to connect to."
echo "Please ensure KUBECONFIG is set to the path of a valid kubeconfig file before running this script."
echo "If kubeconfig is not set, use the command: export KUBECONFIG=PATH-TO-YOUR/kubeconfig. Exiting..."
exit 1
else
CONTEXT=$(kubectl config current-context)
echo "Retrieving logs from kubernetes context: $CONTEXT"
fi

# Check if namespace exist or not
Expand All @@ -71,7 +79,7 @@ if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
exit 1
fi
# Directory to store logs
OUTPUT_DIR="${NAMESPACE}_logs_$(date +%Y%m%d%H%M%S)"
OUTPUT_DIR="${NAMESPACE}_logs_$(date -u +%Y-%m-%d_%H-%M-%SZ)"
ARCHIVE_FILE="${OUTPUT_DIR}.tar.gz"

techo() {
Expand Down Expand Up @@ -131,6 +139,20 @@ collect_helm_releases() {
techo "Collecting helm releases..."
mkdir -p "$OUTPUT_DIR/releases"

# Restrict keys extracted from Helm values to only this include-list to avoid including any
included_keys='["resources", "affinity", "nodeSelector", "tolerations"]'

# 1. --argjson keys "$included_keys": Passes the shell variable as a JSON array $keys.
# 2. . as $input: Saves the entire original JSON into a variable $input.
# 3. [ paths | ... ]: Gathers all paths from the JSON.
# 4. select(.[-1] as $last | $keys | index($last)): Selects only paths where
# the last element (.[-1]) is found inside the $keys array.
# 5. reduce .[] as $p (null; ...): Starts with an empty (null) document
# and iterates over every path ($p) that was selected.
# 6. setpath($p; $input | getpath($p)): For each path, it sets that path
# in the *new* document, pulling the *value* from the original $input.


# Restrict keys extracted from Helm values to only this include-list to avoid including any
included_keys='["resources", "affinity", "nodeSelector", "tolerations"]'

Expand All @@ -149,6 +171,7 @@ collect_helm_releases() {
kubectl -n "$NAMESPACE" get secret "$release" -o jsonpath='{.data.release}' | \
base64 --decode | base64 --decode | gzip -d | \
jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release"
jq --argjson keys "$included_keys" '{ info: .info, metadata: .chart.metadata, config: ( .config as $input | [ .config | paths | select(.[-1] as $last | $keys | index($last)) ] | reduce .[] as $p (null; setpath($p; $input | getpath($p)))) }' > "$OUTPUT_DIR/releases/$release"
done
}

Expand Down Expand Up @@ -290,6 +313,18 @@ collect_hbase_report() {
fi
}

#######################################
# Collect the /report-data directory from the workload-observer pod, if one
# is deployed in $NAMESPACE, into $OUTPUT_DIR/workload-observer-data.
# Globals:   NAMESPACE (read), OUTPUT_DIR (read)
# Outputs:   status messages via techo; files under workload-observer-data/
# Returns:   0 always (missing pod is not an error, collection is best-effort)
#######################################
collect_workload_observer_data() {
  techo "Collecting workload observer data..."
  # First pod labelled as the workload-observer component; empty string when
  # the component is not deployed (stderr suppressed for the no-pods case).
  POD=$(kubectl -n "$NAMESPACE" get pod -l app.kubernetes.io/component=workload-observer -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
  if [[ -z "$POD" ]]; then
    techo "INFO: No workload observer pod found, skipping"
    return
  fi

  mkdir -p "$OUTPUT_DIR/workload-observer-data"
  # Copy report data in the background so the remaining collectors can run
  # concurrently; output is discarded as this is a best-effort collection.
  # NOTE(review): no 'wait' is visible before the archive step — confirm the
  # copy completes before $OUTPUT_DIR is tarred, or data may be truncated.
  kubectl -n "$NAMESPACE" cp "$POD:/report-data" "$OUTPUT_DIR/workload-observer-data/" > /dev/null 2>&1 &
}

archive_and_cleanup() {
echo "Creating archive $ARCHIVE_FILE..."
tar -czf "$ARCHIVE_FILE" "$OUTPUT_DIR"
Expand Down Expand Up @@ -348,10 +383,14 @@ collect_pod_logs
collect_pod_disk_usage
collect_hdfs_report
collect_hbase_report
collect_hdfs_report
collect_hbase_report
collect_yaml_configs
collect_workload_observer_data
if $HELM_RELEASES; then
collect_helm_releases
fi
if $ELASTICSEARCH_LOGS; then
if $ELASTICSEARCH_LOGS; then
collect_pod_logs_from_elasticsearch
fi
Expand Down
1 change: 1 addition & 0 deletions docs/latest/modules/en/nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
*** xref:setup/release-notes/v2.6.0.adoc[v2.6.0 - 29/Sep/2025]
*** xref:setup/release-notes/v2.6.1.adoc[v2.6.1 - 13/Oct/2025]
*** xref:setup/release-notes/v2.6.2.adoc[v2.6.2 - 03/Nov/2025]
*** xref:setup/release-notes/v2.6.3.adoc[v2.6.3 - 25/Nov/2025]
** xref:setup/upgrade-stackstate/README.adoc[Upgrade SUSE Observability]
*** xref:setup/upgrade-stackstate/migrate-from-6.adoc[Migration from StackState]
*** xref:setup/upgrade-stackstate/steps-to-upgrade.adoc[Steps to upgrade]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,54 @@ Note that by adding more time to the data retention period, the amount of data s

When lowering the retention period, it can take some time until disk space is freed up (at least 15 minutes).

=== Troubleshooting topology disk space issues
If you run into disk space issues, a log line containing `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1` appears in the namenode logs. Follow the steps below to recover:

* Lower the retention, prepare the instance to recover disk space immediately, and trigger a helm upgrade:
[,yaml]
----
stackstate:
topology:
# Retention set to 1 week in case you are running with the default 1 month
retentionHours: 144
hbase:
console:
enabled: true
replicaCount: 1
hdfs:
datanode:
extraEnv:
open:
HDFS_CONF_dfs_datanode_du_reserved_pct: "0"
----

[NOTE]
====
Wait until all the hbase and hdfs pods are stable before moving on to the next step.
====

* Trigger the compaction of historic data:
[,bash]
----
kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)"
----

* Follow the progress using:
----
kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)"
----

* If the budgeted disk space is insufficient, contact SUSE support through the support portal.

* Restore the settings. Once the status is no longer in progress - `Status(inProgress = false, lastFailure = null)` - trigger a helm upgrade to preserve the new retention as part of your values.
[,yaml]
----
stackstate:
topology:
# Retention set to 1 week in case you are running with the default 1 month
retentionHours: 144
----

== Retention of events and logs

=== SUSE Observability data store
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ We provide a dedicated set of Helm values that adjusts all volume sizes to meet
zookeeper:
persistence:
size: 20Gi
hbase:
tephra:
persistence:
size: 20Gi
stackstate:
components:
checks:
Expand All @@ -34,6 +38,9 @@ stackstate:
vmagent:
persistence:
size: 20Gi
workloadObserver:
persistence:
size: 20Gi
features:
storeTransactionLogsToPVC:
volumeSize: 20Gi
Expand Down
32 changes: 32 additions & 0 deletions docs/latest/modules/en/pages/setup/release-notes/v2.6.3.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
= v2.6.3 - 25/Nov/2025
:revdate: 2025-11-25
:page-revdate: {revdate}
:description: SUSE Observability Self-hosted

== Release Notes: {stackstate-product-name} Helm Chart v2.6.3

== New Features & Enhancements

* *HDFS Upgrade:* HDFS (Hadoop Distributed File System) and its associated dependencies have been upgraded.
* *StackPack: Partial Topology Sync Monitor:* A new monitor has been added to the StackState StackPack to alert on **partial Topology Synchronization snapshots**.
* *vmagent Resource Increase:* The memory and CPU resource requirements for the `vmagent` component have been increased in the `4000-ha` profile.
* *Image Upgrades:*
** The **Kafka** container image has been upgraded.
** The **ClickHouse** container image has been upgraded.

== Bug Fixes

* *OpenTelemetry Metric Scoping:* Fixed a critical issue where metrics ingested via the OpenTelemetry collector were missing the `_scope_` label. This prevented **scoped users** from being able to observe these metrics.
* *Metric Explorer Sorting:* The **Metric Explorer** now uses numerical sorting for values in the value column.
* *Platform: StackGraph Corruption (Timed-Out Transactions):* Fixed a **StackGraph corruption issue** where data from timed-out transactions that should have been rolled back could inadvertently reappear.
* *Platform: State Pod Validation:* Added **additional data validation and logging** to the state pod for improved stability and debugging.
* *StackGraph: Edge Deletion Invariant:* Added an invariant to prevent inconsistent edge references when performing a delete edge operation in **StackGraph**.
* *StackGraph Integrity Verifier:* An **experimental perpetual integrity verifier** has been added for StackGraph. It can be enabled by setting `hbase.console.integrity.enabled=true`.
* *StackPack Remediation Guides:* Fixed several remediation guides within the SUSE Observability stackpack that incorrectly referenced `tags` instead of the correct term, **`labels`**.
* *Duplicate OpenTelemetry StackPack:* Removed a duplicate **OpenTelemetry stackpack** installation.
* *Platform: Agent Restart Snapshot Loop:* Fixed an issue where a restart of an agent could cause the **'active snapshot'** to continuously occur.
* *Platform: Kafka JMX OOM Fix:* Resolved an Out-Of-Memory (OOM) issue for the Kafka JMX container on RKE2 Kubernetes versions 1.31 and 1.30.

=== Agent Bug Fixes

* *Agent: /proc/<pid>/stat Panic:* The agent now includes a fix to prevent a panic when a `/proc/<pid>/stat` file is found to be empty.
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,28 @@ For Rancher RBAC to function,
* the {stackstate-product-name} Agent must have the RBAC Agent enabled and must authenticate using a service token.
====

Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and lets a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. To see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*:
Every authenticated user has the *Instance Basic Access* role that allows them to use the system. These permissions provide access to the views, settings, metric bindings, and lets a user see system notifications. They do NOT grant access to any {stackstate-product-name} data. In order to see any data, a user needs to be given an additional role. Two directions for extending the *Instance Basic Access* role are provided with Rancher *Role Templates*:

Instance Roles:: Enables you to configure or personalize {stackstate-product-name}.
Instance Roles:: Enables you to configure or personalize {stackstate-product-name}.
Scoped Roles:: Grants access to {stackstate-product-name} data from observed clusters.

== Instance roles

You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. If no instance roles are explicitly assigned to a member of a project, then the permissions of the *Instance Basic Access* role is applied.
You can assign the *Role Templates* for *Instance Roles* to users or groups in the *Project* that is running {stackstate-product-name}. If no instance roles are explicitly assigned to a member of a project, then they will have the permissions of the *Instance Basic Access* role.

=== Instance roles with access to {stackstate-product-name} data

A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles].
A couple of "global" roles allow access to all {stackstate-product-name} data - in any of the observed clusters. These roles are intended to be used for setting up the system and for troubleshooting system-level problems. For users with any of these roles, it is not necessary to configure xref:scoped[Scoped Roles].

Instance Admin:: Grants full access to all views and all permissions.
Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI.
Instance Observer:: Grants access to all data in a SUSE Observability instance.
Instance Troubleshooter:: Grants all permissions required to use SUSE Observability for troubleshooting, including the ability to enable/disable monitors, create custom views, and use the CLI.
Instance Observer:: Grants access to all data in a SUSE Observability instance.

=== Instance roles without access to {stackstate-product-name} data

These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). Otherwise, no {stackstate-product-name} data is accessible and a "No components found" message appears in the UI. This applies to all Rancher users, including users, such as Project owners.
These roles need to be combined with the *Instance Observer* role or one of the xref:scoped[Scoped Roles] (see below). Otherwise, no {stackstate-product-name} data is accessible and the UI will show a "No components found" message. This applies to all Rancher users, including privileged users such as Project owners.

Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalization {stackstate-product-name}.
Instance Recommended Access:: Grants recommended permissions to use SUSE Observability. This role includes permissions that are not strictly necessary, but provide (limited) means of personalization {stackstate-product-name}.
Instance Basic Access:: Grants minimal permissions to use {stackstate-product-name}. This role does not need to be explicitly assigned and there is no *Role Template* for it; every logged-in user has it.

You can find the permissions assigned to each predefined SUSE Observability role below. For details of the different permissions and how to manage them using the `sts` CLI, see xref:/setup/security/rbac/rbac_permissions.adoc[Role based access control (RBAC) permissions]
Expand All @@ -59,12 +59,30 @@ These permissions are granted to all users.
|views |get
|===

--
Basic Access::
+
--
Basic access grants minimal permissions for using SUSE Observability. To be combined with an Observer (Instance, Cluster or Project).
These permissions are granted to all users.

|===
|Resource |Verbs

|metric-bindings |get
|settings |get
|system-notifications |get
|views |get
|===

--
Recommended Access::
+
--
Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. It provides a limited degree of personalization.
To be combined with an Observer (Instance, Cluster or Project).
Recommended access grants permissions that are not strictly necessary, but that make SUSE Observability a lot more useful. It provides a limited degree of personalization.
To be combined with an Observer (Instance, Cluster or Project).

|===
|Resource |Verbs
Expand All @@ -80,7 +98,7 @@ To be combined with an Observer (Instance, Cluster or Project).
Observer::
+
--
Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience.
Observer grants access to all observability data in a SUSE Observability instance. Combine with *Recommended Access* for a better experience.

|===
|Resource |Verbs
Expand Down Expand Up @@ -121,6 +139,7 @@ The Troubleshooter role has access to all data available in SUSE Observability a

--
Admin::
Admin::
+
--
The Administrator role has all permissions assigned.
Expand Down Expand Up @@ -160,7 +179,7 @@ The Administrator role has all permissions assigned.
[#scoped]
== Scoped roles

You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data.
You can assign the following *Role Templates* to users or groups in an observed cluster. They grant access to {stackstate-product-name} data coming from (a *Project* in) the *Cluster*, giving a user permission to read topology, metrics, logs and trace data.

Observer:: Grants access to data coming from namespaces in a *Project*. You can use this in the *Project Membership* section of the cluster configuration.
Cluster Observer:: Grants access to all data coming from a *Cluster*. You can use this template in the *Cluster Membership* section of the cluster configuration.
Expand All @@ -171,21 +190,24 @@ The resources in these roles correspond to xref:/setup/security/rbac/rbac_permis
* `traces` - spans from the cluster or namespace
* `metrics` - metric data originating from the cluster or namespace

Note that access to logs is controlled by the `topology` resource.
Note that access to logs is controlled by the `topology` resource.

Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}.
Enable personalization for users with these observer roles by granting the *Instance Recommended Access* role on the *Project* running {stackstate-product-name}.

== Custom roles
== Custom roles

To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with:
To grant additional permissions beyond Recommended Access, create a custom Project *RoleTemplate* in Rancher, inheriting from *SUSE Observability Instance Recommended Access*. Then, for example, to grant the rights to view monitors and metric charts, add rules with:

* Verb: `get`
* Resource: `metricbindings` and `monitors`
* ApiGroup: `instance.observability.cattle.io`

image::rancher-custom-role.png[Custom RoleTemplate for richer access]

You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`.
You can specify any resource and verb combination defined in the xref:/setup/security/rbac/rbac_permissions.adoc[RBAC Permissions]. Note that the dashes (`-`) are dropped from resource names, so the permission `get-metric-bindings` becomes the Kubernetes RBAC resource `metricbindings` with the verb `get`.


== Troubleshooting
Expand Down