From a67d2fc0e03c0964e79898005489fe19026044f9 Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Fri, 14 Nov 2025 17:24:37 +0100 Subject: [PATCH 1/8] STAC-23751: Document procedure to lower retention on SG and recover data immediately --- .../setup/data-management/data_retention.adoc | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index 2f8d1d95..bb232e7a 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -24,6 +24,54 @@ Note that by adding more time to the data retention period, the amount of data s When lowering the retention period, it can take some time until disk space is freed up (at least 15 minutes). +=== Troubleshooting topology disk space issues. +In case of running into disk space issues, we usually find in the namenode a log line like `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1`, to deal with this scenario: + +* Lower the retention and prepare the instance to recover disk space immediately. Trigger a helm upgrade with: +[,yaml] +---- +stackstate: + topology: + # Retention set to 1 week in case you are running with the default 1 month + retentionHours: 144 +hbase: + console: + enabled: true + replicaCount: 1 + hdfs: + datanode: + extraEnv: + open: + HDFS_CONF_dfs_datanode_du_reserved_pct: "0" +---- + +[NOTE] +==== +Wait until all the hbase and hdfs pods are stable before moving on to the next step. 
+==== + +* Trigger the compaction of historic data +[,bash] +---- +kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)" +---- + +* Follow the progress using +---- +kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)" +---- + +* Contact support to analyze why the budgeted disk space was insufficient. + +* Restore the settings. Once the status is no longer inProgress `Status(inProgress = false, lastFailure = null)` trigger a helm upgrade just preserving the new retention as part of your values. +[,yaml] +---- +stackstate: + topology: + # Retention set to 1 week in case you are running with the default 1 month + retentionHours: 144 +---- + == Retention of events and logs === SUSE Observability data store From 40a132f06933045301dcf1e3d04d9113545df92c Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:58:48 +0100 Subject: [PATCH 2/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index bb232e7a..2239c739 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -24,7 +24,7 @@ Note that by adding more time to the data retention period, the amount of data s When lowering the retention period, it can take some time 
until disk space is freed up (at least 15 minutes). -=== Troubleshooting topology disk space issues. +=== Troubleshooting topology disk space issues In case of running into disk space issues, we usually find in the namenode a log line like `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1`, to deal with this scenario: * Lower the retention and prepare the instance to recover disk space immediately. Trigger a helm upgrade with: From 4c5861c9abf32403b5e4368f2d6a4eca1a61b55f Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:59:05 +0100 Subject: [PATCH 3/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index 2239c739..cbd7683d 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -25,7 +25,7 @@ Note that by adding more time to the data retention period, the amount of data s When lowering the retention period, it can take some time until disk space is freed up (at least 15 minutes). === Troubleshooting topology disk space issues -In case of running into disk space issues, we usually find in the namenode a log line like `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1`, to deal with this scenario: +In case of running into disk space issues, a log line - `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1` appears in the namenode. Follow the below steps to deal with this scenario: * Lower the retention and prepare the instance to recover disk space immediately. 
Trigger a helm upgrade with: [,yaml] From cb83d45aaa58e44be662e641a5d41eeec5b5f5a1 Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:59:12 +0100 Subject: [PATCH 4/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index cbd7683d..e49608a7 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -27,7 +27,7 @@ When lowering the retention period, it can take some time until disk space is fr === Troubleshooting topology disk space issues In case of running into disk space issues, a log line - `Not enough replicas was chosen. Reason: {NOT_ENOUGH_STORAGE_SPACE=1` appears in the namenode. Follow the below steps to deal with this scenario: -* Lower the retention and prepare the instance to recover disk space immediately. 
Trigger a helm upgrade with: +* Lower the retention, prepare the instance to recover disk space immediately, and trigger a helm upgrade: [,yaml] ---- stackstate: From cc7d03585d3441087a6f68e28633aeea408db1f2 Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:59:18 +0100 Subject: [PATCH 5/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index e49608a7..15005fbf 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -56,7 +56,7 @@ Wait until all the hbase and hdfs pods are stable before moving on to the next s kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)" ---- -* Follow the progress using +* Follow the progress using: ---- kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)" ---- From c87ad41e38e65890c4450d9f9d4e4a9bfd8d9eaf Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:59:26 +0100 Subject: [PATCH 6/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index 15005fbf..a4b8e9c8 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -61,7 +61,7 @@ kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace s kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediatelyStatus\(\)\)" ---- -* Contact support to analyze why the budgeted disk space was insufficient. +* In case the budgeted disk space is insufficient, contact support. * Restore the settings. Once the status is no longer inProgress `Status(inProgress = false, lastFailure = null)` trigger a helm upgrade just preserving the new retention as part of your values. [,yaml] From 5907594b724694a0d1ae1af78ac00787912fa4f5 Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 11:59:32 +0100 Subject: [PATCH 7/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index a4b8e9c8..36f0ca68 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -50,7 +50,7 @@ hbase: Wait until all the hbase and hdfs pods are stable before moving on to the next step. 
==== -* Trigger the compaction of historic data +* Trigger the compaction of historic data: [,bash] ---- kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace suse-observability --no-headers | grep "console" | awk '{print $1}' | head -n 1) -- /bin/bash -c "stackgraph-console run println\(retention.removeExpiredDataImmediately\(\)\)" From 7e6484a76d6d9824f74661777923ab7ade596e66 Mon Sep 17 00:00:00 2001 From: Alejandro Acevedo Date: Mon, 17 Nov 2025 12:01:10 +0100 Subject: [PATCH 8/8] Update docs/latest/modules/en/pages/setup/data-management/data_retention.adoc Co-authored-by: akashraj4261 --- .../modules/en/pages/setup/data-management/data_retention.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc index 36f0ca68..dc02eb18 100644 --- a/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc +++ b/docs/latest/modules/en/pages/setup/data-management/data_retention.adoc @@ -63,7 +63,7 @@ kubectl exec -t --namespace suse-observability $(kubectl get pods --namespace s * In case the budgeted disk space is insufficient, contact support. -* Restore the settings. Once the status is no longer inProgress `Status(inProgress = false, lastFailure = null)` trigger a helm upgrade just preserving the new retention as part of your values. +* Restore the settings. Once the status is no longer in progress - `Status(inProgress = false, lastFailure = null)`, trigger a helm upgrade to preserve the new retention as part of your values. [,yaml] ---- stackstate: