From d2e08ceb7397d541f6aad2866304a71d9e01bf56 Mon Sep 17 00:00:00 2001 From: Victor Palma Date: Wed, 29 Oct 2025 19:37:08 -0500 Subject: [PATCH 01/10] feat: add opentelemetry-kube-stack service configuration - Add OpenTelemetry Kube Stack Helm release configuration - Include hardened values for v0.11.1 with security settings - Configure observability namespace and RBAC - Add comprehensive telemetry collection and processing --- .../opentelemetry-kube-stack/README.md | 204 +++++++++++++++ .../helm-values/hardened-values-v0.11.1.yaml | 233 ++++++++++++++++++ .../opentelemetry-kube-stack/helmrelease.yaml | 37 +++ .../kustomization.yaml | 14 ++ .../opentelemetry-kube-stack/namespace.yaml | 5 + .../opentelemetry-kube-stack/source.yaml | 8 + 6 files changed, 501 insertions(+) create mode 100644 applications/base/services/opentelemetry-kube-stack/README.md create mode 100644 applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml create mode 100644 applications/base/services/opentelemetry-kube-stack/helmrelease.yaml create mode 100644 applications/base/services/opentelemetry-kube-stack/kustomization.yaml create mode 100644 applications/base/services/opentelemetry-kube-stack/namespace.yaml create mode 100644 applications/base/services/opentelemetry-kube-stack/source.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/README.md b/applications/base/services/opentelemetry-kube-stack/README.md new file mode 100644 index 0000000..b4f636c --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/README.md @@ -0,0 +1,204 @@ +# OpenTelemetry Kube Stack + +The OpenTelemetry Kube Stack is a comprehensive observability solution that provides a complete OpenTelemetry setup for Kubernetes clusters. It includes the OpenTelemetry Operator, collectors, and essential monitoring components. + +## Overview + +This chart deploys: +- **OpenTelemetry Operator**: Manages OpenTelemetry collectors and instrumentation +- **OpenTelemetry Collector**: Collects, processes, and exports telemetry data +- **Kube State Metrics**: Exposes cluster-level metrics about Kubernetes objects +- **Node Exporter**: Collects hardware and OS metrics from cluster nodes + +## Configuration + +### Chart Information +- **Chart**: opentelemetry-kube-stack +- **Version**: 0.11.1 +- **App Version**: 0.129.1 +- **Repository**: https://open-telemetry.github.io/opentelemetry-helm-charts + +### Namespace +Deployed in the `observability` namespace alongside other monitoring components. 
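The chart is reconciled through a Flux `HelmRelease` (added in this change) that layers an optional override Secret on top of the hardened base values. A minimal sketch of such an override, assuming the `opentelemetry-kube-stack-values-override` Secret name and `override.yaml` key referenced by that HelmRelease; only the keys set here are merged over the hardened values:

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: opentelemetry-kube-stack-values-override
  namespace: observability
type: Opaque
stringData:
  override.yaml: |
    # Per-environment overrides, e.g. a cluster-specific name
    clusterName: "my-cluster"
```

Because the HelmRelease marks this Secret as `optional: true`, environments that need no overrides can simply omit it.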
+ +### Security Hardening + +The deployment includes comprehensive security configurations: + +#### Container Security +- Non-root execution (`runAsNonRoot: true`) +- Specific user ID (`runAsUser: 65534`) +- Security profiles (`seccompProfile.type: RuntimeDefault`) +- Capability dropping (`capabilities.drop: [ALL]`) +- Read-only root filesystem (`readOnlyRootFilesystem: true`) +- Privilege escalation disabled (`allowPrivilegeEscalation: false`) + +#### Resource Management +- CPU and memory limits defined for all components +- Resource requests set for proper scheduling +- Memory limiter processor configured for collectors + +#### Network Security +- OTLP receivers configured on standard ports (4317/4318) +- Service monitors enabled for Prometheus integration +- Node selectors for Linux-only deployment + +### Key Features + +#### OpenTelemetry Operator +- Manages collector lifecycle and configuration +- Supports auto-instrumentation for applications +- Webhook-based configuration validation + +#### Collector Configuration +- OTLP receivers for traces, metrics, and logs +- Batch processing for efficient data handling +- Memory limiting to prevent resource exhaustion +- Logging exporter for initial setup (can be customized) + +#### Monitoring Integration +- Prometheus ServiceMonitor resources enabled +- Kube State Metrics for cluster-level observability +- Node Exporter for infrastructure metrics +- Compatible with existing Prometheus stack + +### Customization + +#### Collector Configuration +The default collector configuration can be extended by modifying the `config` section in the hardened values file. Common customizations include: + +```yaml +config: + exporters: + otlp: + endpoint: "your-backend:4317" + tls: + insecure: false + prometheusremotewrite: + endpoint: "https://prometheus.example.com/api/v1/write" +``` + +#### Resource Scaling +Adjust resource limits based on cluster size and telemetry volume: + +```yaml +resources: + requests: + memory: "256Mi" + cpu: "200m" + limits: + memory: "1Gi" + cpu: "1000m" +``` + +### Dependencies + +This chart has dependencies on: +- OpenTelemetry CRDs (installed automatically) +- Kubernetes 1.19+ for proper ServiceMonitor support +- Prometheus Operator (for ServiceMonitor resources) + +### Compatibility + +#### Existing Services +The configuration is designed to work alongside existing observability services: +- **kube-prometheus-stack**: Kubernetes service monitors disabled to avoid conflicts +- **Prometheus CRDs**: Installation disabled (uses existing CRDs) +- **Grafana**: Compatible with OpenTelemetry data sources + +#### OpenTelemetry Operator +This deployment may conflict with the existing `opentelemetry-operator` service. 
Consider: +- Using this as a replacement for the standalone operator +- Disabling the operator component if only collectors are needed +- Coordinating CRD management between deployments + +### Monitoring and Observability + +#### Health Checks +Monitor the deployment status: +```bash +kubectl get helmrelease opentelemetry-kube-stack -n observability +kubectl get pods -n observability -l app.kubernetes.io/name=opentelemetry-kube-stack +``` + +#### Collector Status +Check OpenTelemetry collector status: +```bash +kubectl get opentelemetrycollector -n observability +kubectl logs -n observability -l app.kubernetes.io/component=opentelemetry-collector +``` + +#### Metrics Availability +Verify metrics collection: +```bash +kubectl port-forward -n observability svc/opentelemetry-kube-stack-collector 8888:8888 +curl http://localhost:8888/metrics +``` + +### Troubleshooting + +#### Common Issues + +1. **CRD Conflicts**: If OpenTelemetry CRDs already exist, disable installation: + ```yaml + crds: + installOtel: false + ``` + +2. **Resource Constraints**: Increase resource limits if collectors are OOMKilled: + ```yaml + resources: + limits: + memory: "1Gi" + ``` + +3. **Webhook Failures**: If admission webhooks cause issues: + ```yaml + opentelemetry-operator: + admissionWebhooks: + failurePolicy: "Ignore" + ``` + +#### Debug Commands +```bash +# Check operator logs +kubectl logs -n observability -l app.kubernetes.io/name=opentelemetry-operator + +# Describe collector resources +kubectl describe opentelemetrycollector -n observability + +# Check service monitor status +kubectl get servicemonitor -n observability +``` + +### Integration Examples + +#### Application Instrumentation +Enable auto-instrumentation for applications: +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: my-instrumentation +spec: + exporter: + endpoint: http://opentelemetry-kube-stack-collector:4317 + propagators: + - tracecontext + - baggage +``` + +#### Custom Exporters +Configure exporters for your observability backend: +```yaml +config: + exporters: + jaeger: + endpoint: jaeger-collector:14250 + tls: + insecure: true + prometheus: + endpoint: "0.0.0.0:8889" +``` + +This deployment provides a solid foundation for OpenTelemetry-based observability in Kubernetes environments with enterprise-grade security and monitoring capabilities. 
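Auto-instrumentation is opt-in per workload: the operator's mutating webhook only injects an agent when the pod template carries an injection annotation. A minimal sketch that pairs with the `Instrumentation` resource shown above (the deployment name, labels, and image are placeholders):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
spec:
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
      annotations:
        # Inject the Java agent configured by the "my-instrumentation" resource
        instrumentation.opentelemetry.io/inject-java: "my-instrumentation"
    spec:
      containers:
        - name: my-app
          image: registry.example.com/my-app:1.0.0
```

Equivalent annotations exist for the other supported language agents (for example `instrumentation.opentelemetry.io/inject-python` and `instrumentation.opentelemetry.io/inject-nodejs`).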
\ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml new file mode 100644 index 0000000..1c11fdd --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml @@ -0,0 +1,233 @@ +# Security configurations for OpenTelemetry Kube Stack +# Version: 0.11.1 + +# Cluster name for identification +clusterName: "openCenter-cluster" + +# OpenTelemetry Operator configuration +opentelemetry-operator: + enabled: true + manager: + # Security context for operator manager + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits for operator + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "500m" + # Admission webhooks configuration + admissionWebhooks: + failurePolicy: "Ignore" + # Security context for webhooks + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Default collector configuration with security hardening +defaultCRConfig: + enabled: true + mode: deployment + replicas: 2 + + # Security contexts + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + + podSecurityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + seccompProfile: + type: RuntimeDefault + + # Container security context + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + + # Resource limits + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + + # Node selector for Linux nodes + nodeSelector: + kubernetes.io/os: linux + + # Basic OTLP configuration + config: + receivers: + otlp: + protocols: + grpc: + endpoint: ${env:MY_POD_IP}:4317 + http: + endpoint: ${env:MY_POD_IP}:4318 + processors: + batch: + timeout: 1s + send_batch_size: 1024 + memory_limiter: + limit_mib: 400 + spike_limit_mib: 100 + check_interval: 5s + exporters: + logging: + loglevel: info + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + +# Kube State Metrics configuration +kubeStateMetrics: + enabled: true + +kube-state-metrics: + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "200m" + # Node selector + nodeSelector: + kubernetes.io/os: linux + # Prometheus monitoring + prometheus: + monitor: + enabled: true + honorLabels: true + +# Node Exporter configuration +nodeExporter: + enabled: true + +prometheus-node-exporter: + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + 
type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "200m" + # Node selector + nodeSelector: + kubernetes.io/os: linux + # Prometheus monitoring + prometheus: + monitor: + enabled: true + jobLabel: node-exporter + +# Kubernetes service monitors (disabled to avoid conflicts with existing monitoring) +kubernetesServiceMonitors: + enabled: false + +# Individual component monitors (disabled to avoid conflicts) +kubeApiServer: + enabled: false +kubelet: + enabled: false +kubeControllerManager: + enabled: false +coreDns: + enabled: false +kubeEtcd: + enabled: false +kubeScheduler: + enabled: false +kubeProxy: + enabled: false + +# CRDs installation +crds: + installOtel: true + installPrometheus: false # Disabled to avoid conflicts with existing Prometheus stack + +# Cleanup job configuration +cleanupJob: + enabled: true + image: + repository: rancher/kubectl + tag: v1.34.1 + # Security context for cleanup job + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/helmrelease.yaml b/applications/base/services/opentelemetry-kube-stack/helmrelease.yaml new file mode 100644 index 0000000..722e55e --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/helmrelease.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: opentelemetry-kube-stack + namespace: observability +spec: + releaseName: opentelemetry-kube-stack + interval: 5m + timeout: 10m + driftDetection: + mode: enabled + install: + remediation: + retries: 3 + remediateLastFailure: true + upgrade: + remediation: + retries: 0 + remediateLastFailure: false + targetNamespace: observability + chart: + spec: + chart: opentelemetry-kube-stack + version: 0.11.1 + sourceRef: + kind: HelmRepository + name: opentelemetry + namespace: observability + valuesFrom: + - kind: Secret + name: opentelemetry-kube-stack-values-base + valuesKey: hardened.yaml + - kind: Secret + name: opentelemetry-kube-stack-values-override + valuesKey: override.yaml + optional: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/kustomization.yaml b/applications/base/services/opentelemetry-kube-stack/kustomization.yaml new file mode 100644 index 0000000..c30089d --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/kustomization.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - "./namespace.yaml" + - "./source.yaml" + - "./helmrelease.yaml" + +secretGenerator: + - name: opentelemetry-kube-stack-values-base + type: Opaque + files: [hardened.yaml=helm-values/hardened-values-v0.11.1.yaml] + options: + disableNameSuffixHash: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/namespace.yaml b/applications/base/services/opentelemetry-kube-stack/namespace.yaml new file mode 100644 index 0000000..bd5727d --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: observability \ No newline at end of 
file diff --git a/applications/base/services/opentelemetry-kube-stack/source.yaml b/applications/base/services/opentelemetry-kube-stack/source.yaml new file mode 100644 index 0000000..9393c6b --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/source.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opentelemetry +spec: + url: https://open-telemetry.github.io/opentelemetry-helm-charts + interval: 1h \ No newline at end of file From 9a190eb8784ef2f2616c5b7196932ecc9d10b02f Mon Sep 17 00:00:00 2001 From: Victor Palma Date: Wed, 29 Oct 2025 18:27:51 -0500 Subject: [PATCH 02/10] feat: add OpenTelemetry operator service - Add OpenTelemetry operator v0.98.0 to observability namespace - Include security-hardened configuration with: - High availability (2 replicas with PDB) - Non-root execution and read-only root filesystem - Resource limits and security contexts - cert-manager integration for webhook TLS - Prometheus monitoring enabled - Follow openCenter GitOps standards and patterns - Update main README.md with service documentation - Add comprehensive service-specific README Resolves: OpenTelemetry operator deployment for auto-instrumentation --- README.md | 12 ++ .../services/opentelemetry-operator/README.md | 119 +++++++++++++ .../helm-values/hardened-values-v0.98.0.yaml | 163 ++++++++++++++++++ .../opentelemetry-operator/helmrelease.yaml | 37 ++++ .../opentelemetry-operator/kustomization.yaml | 14 ++ .../opentelemetry-operator/namespace.yaml | 5 + .../opentelemetry-operator/source.yaml | 8 + docs/opentelemetry.md | 49 ++++++ 8 files changed, 407 insertions(+) create mode 100644 applications/base/services/opentelemetry-operator/README.md create mode 100644 applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml create mode 100644 applications/base/services/opentelemetry-operator/helmrelease.yaml create mode 100644 applications/base/services/opentelemetry-operator/kustomization.yaml create mode 100644 applications/base/services/opentelemetry-operator/namespace.yaml create mode 100644 applications/base/services/opentelemetry-operator/source.yaml create mode 100644 docs/opentelemetry.md diff --git a/README.md b/README.md index 930cb79..0c21398 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ applications/ | **kube-prometheus-stack** | Core Service | `observability` | Complete monitoring and alerting stack | | **metallb** | Core Service | `metallb-system` | Bare metal load balancer | | **olm** | Core Service | `olm` | Operator Lifecycle Manager | +| **opentelemetry-operator** | Core Service | `observability` | OpenTelemetry operator for auto-instrumentation | | **sealed-secrets** | Core Service | `sealed-secrets` | Encrypted secrets management | | **velero** | Core Service | `velero` | Cluster backup and disaster recovery | | **alert-proxy** | Managed Service | `rackspace` | Rackspace alert aggregation | @@ -103,6 +104,17 @@ applications/ - Dependency resolution - Automatic updates +#### **opentelemetry-operator** +- **Purpose**: OpenTelemetry operator for auto-instrumentation and collector management +- **Source**: OpenTelemetry Helm repository (`https://open-telemetry.github.io/opentelemetry-helm-charts`) +- **Namespace**: `observability` +- **Features**: + - Automatic OpenTelemetry instrumentation injection + - OpenTelemetry Collector deployment and management + - Custom resource definitions for OpenTelemetry configuration + - Webhook-based sidecar injection + - 
Multi-language auto-instrumentation support (Java, Node.js, Python, .NET, Go) + #### **sealed-secrets** - **Purpose**: Encrypted secrets management - **Namespace**: `sealed-secrets` diff --git a/applications/base/services/opentelemetry-operator/README.md b/applications/base/services/opentelemetry-operator/README.md new file mode 100644 index 0000000..a73f16a --- /dev/null +++ b/applications/base/services/opentelemetry-operator/README.md @@ -0,0 +1,119 @@ +# OpenTelemetry Operator + +The OpenTelemetry Operator is a Kubernetes operator that manages OpenTelemetry Collector instances and auto-instrumentation of workloads using OpenTelemetry instrumentation libraries. + +## Overview + +This service deploys the OpenTelemetry Operator in the `observability` namespace with security hardening and high availability configuration. + +## Features + +- **Auto-instrumentation**: Automatically inject OpenTelemetry instrumentation into applications +- **Collector Management**: Deploy and manage OpenTelemetry Collector instances +- **CRD Management**: Provides custom resources for OpenTelemetry configuration +- **Webhook Support**: Admission webhooks for sidecar injection and validation + +## Configuration + +### Chart Information +- **Chart**: `opentelemetry/opentelemetry-operator` +- **Version**: `0.98.0` +- **App Version**: `0.137.0` +- **Repository**: https://open-telemetry.github.io/opentelemetry-helm-charts + +### Security Hardening + +The deployment includes the following security measures: + +- **Non-root execution**: All containers run as non-root user (65532) +- **Read-only root filesystem**: Containers use read-only root filesystems +- **Dropped capabilities**: All Linux capabilities are dropped +- **Security profiles**: Uses RuntimeDefault seccomp profile +- **Resource limits**: CPU and memory limits configured for all containers + +### High Availability + +- **Replica count**: 2 replicas for high availability +- **Pod Disruption Budget**: Ensures minimum availability during disruptions +- **Leader election**: Enabled to prevent split-brain scenarios +- **Anti-affinity**: Distributes pods across nodes (when configured) + +### Monitoring + +- **ServiceMonitor**: Enabled for Prometheus metrics collection +- **Metrics endpoint**: Exposes metrics on port 8080 +- **Health checks**: Readiness and liveness probes configured + +## Custom Resources + +The operator provides the following custom resources: + +- **OpenTelemetryCollector**: Manages collector deployments +- **Instrumentation**: Configures auto-instrumentation for applications +- **OpAMPBridge**: Manages OpAMP bridge instances + +## Dependencies + +- **cert-manager**: Required for TLS certificate management of admission webhooks +- **Prometheus Operator**: Optional, for ServiceMonitor support + +## Usage + +After deployment, you can create OpenTelemetry resources: + +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector + namespace: observability +spec: + config: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + processors: + batch: + exporters: + logging: + loglevel: debug + service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [logging] +``` + +## Troubleshooting + +### Common Issues + +1. **Webhook failures**: Ensure cert-manager is deployed and healthy +2. **CRD conflicts**: Check for existing OpenTelemetry CRDs if upgrading +3. 
**RBAC issues**: Verify cluster-admin permissions during installation + +### Useful Commands + +```bash +# Check operator status +kubectl get pods -n observability -l app.kubernetes.io/name=opentelemetry-operator + +# View operator logs +kubectl logs -n observability -l app.kubernetes.io/name=opentelemetry-operator + +# Check CRDs +kubectl get crd | grep opentelemetry + +# Verify webhooks +kubectl get validatingwebhookconfiguration | grep opentelemetry +kubectl get mutatingwebhookconfiguration | grep opentelemetry +``` + +## References + +- [OpenTelemetry Operator Documentation](https://opentelemetry.io/docs/kubernetes/operator/) +- [Helm Chart Repository](https://github.com/open-telemetry/opentelemetry-helm-charts) +- [OpenTelemetry Specification](https://opentelemetry.io/docs/specs/) \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml b/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml new file mode 100644 index 0000000..bd5ef1a --- /dev/null +++ b/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml @@ -0,0 +1,163 @@ +# Hardened values for OpenTelemetry Operator v0.98.0 +# Security-focused configuration following openCenter standards + +# High availability configuration +replicaCount: 2 + +# Pod Disruption Budget for high availability +pdb: + create: true + minAvailable: 1 + +# Manager configuration with security hardening +manager: + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/opentelemetry-operator + imagePullPolicy: IfNotPresent + + # Collector image configuration + collectorImage: + repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-k8s + tag: 0.137.0 + + # Resource limits for production workloads + resources: + limits: + cpu: 200m + memory: 256Mi + ephemeral-storage: 100Mi + requests: + cpu: 100m + memory: 128Mi + ephemeral-storage: 50Mi + + # Environment variables + env: + ENABLE_WEBHOOKS: "true" + + # ServiceAccount configuration + serviceAccount: + create: true + annotations: {} + + # Enable Prometheus monitoring + serviceMonitor: + enabled: true + extraLabels: {} + annotations: {} + metricsEndpoints: + - port: metrics + + # Enable leader election for HA + leaderElection: + enabled: true + + # Security context - hardened configuration + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + +# Kube RBAC Proxy configuration +kubeRBACProxy: + enabled: true + image: + repository: quay.io/brancz/kube-rbac-proxy + tag: v0.19.1 + + # Resource limits + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + + # Security context - hardened configuration + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + +# Admission webhooks configuration +admissionWebhooks: + create: true + servicePort: 443 + failurePolicy: Fail + + # Pod injection policy + pods: + failurePolicy: Ignore + + # Webhook timeout + timeoutSeconds: 10 + + # Use cert-manager for TLS certificates + certManager: + enabled: true + certificateAnnotations: {} + issuerAnnotations: {} + + # Disable auto-generated certificates since we use cert-manager + autoGenerateCert: + enabled: false + +# CRDs management +crds: + create: true + +# 
RBAC configuration +role: + create: true + +clusterRole: + create: true + +# Node scheduling - Linux nodes only +nodeSelector: + kubernetes.io/os: linux + +# Pod-level security context +securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + +# Service account token mounting +automountServiceAccountToken: true + +# Test framework security hardening +testFramework: + image: + repository: busybox + tag: latest + + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + readOnlyRootFilesystem: true + seccompProfile: + type: RuntimeDefault + + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/helmrelease.yaml b/applications/base/services/opentelemetry-operator/helmrelease.yaml new file mode 100644 index 0000000..21b3a43 --- /dev/null +++ b/applications/base/services/opentelemetry-operator/helmrelease.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: opentelemetry-operator + namespace: observability +spec: + releaseName: opentelemetry-operator + interval: 5m + timeout: 10m + driftDetection: + mode: enabled + install: + remediation: + retries: 3 + remediateLastFailure: true + upgrade: + remediation: + retries: 0 + remediateLastFailure: false + targetNamespace: observability + chart: + spec: + chart: opentelemetry-operator + version: 0.98.0 + sourceRef: + kind: HelmRepository + name: opentelemetry + namespace: observability + valuesFrom: + - kind: Secret + name: opentelemetry-operator-values-base + valuesKey: hardened.yaml + - kind: Secret + name: opentelemetry-operator-values-override + valuesKey: override.yaml + optional: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/kustomization.yaml b/applications/base/services/opentelemetry-operator/kustomization.yaml new file mode 100644 index 0000000..56c09a5 --- /dev/null +++ b/applications/base/services/opentelemetry-operator/kustomization.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - "namespace.yaml" + - "source.yaml" + - "helmrelease.yaml" +secretGenerator: + - name: opentelemetry-operator-values-base + type: Opaque + files: + - hardened.yaml=helm-values/hardened-values-v0.98.0.yaml + options: + disableNameSuffixHash: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/namespace.yaml b/applications/base/services/opentelemetry-operator/namespace.yaml new file mode 100644 index 0000000..bd5727d --- /dev/null +++ b/applications/base/services/opentelemetry-operator/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: observability \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/source.yaml b/applications/base/services/opentelemetry-operator/source.yaml new file mode 100644 index 0000000..9393c6b --- /dev/null +++ b/applications/base/services/opentelemetry-operator/source.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opentelemetry +spec: + url: https://open-telemetry.github.io/opentelemetry-helm-charts + interval: 1h \ No newline at end of file diff --git a/docs/opentelemetry.md b/docs/opentelemetry.md new file mode 100644 index 0000000..276274f --- /dev/null +++ b/docs/opentelemetry.md @@ -0,0 
+1,49 @@ +┌─────────────────────────────────────────────────────────────┐ +│ Application Layer │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Apps │ │ Apps │ │ Apps │ │ Apps │ │ +│ │ (OTEL │ │ (OTEL │ │ (OTEL │ │ (OTEL │ │ +│ │ SDK) │ │ SDK) │ │ SDK) │ │ SDK) │ │ +│ └─────┬────┘ └─────┬────┘ └─────┬────┘ └─────┬────┘ │ +│ │ │ │ │ │ +└────────┼──────────────┼──────────────┼──────────────┼───────┘ + │ │ │ │ + ┌────▼──────────────▼──────────────▼──────────────▼──────┐ + │ OpenTelemetry Collector (DaemonSet) │ + │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ + │ │ Traces │ │ Metrics │ │ Logs │ │ + │ │Receiver │ │Receiver │ │Receiver │ │ + │ └────┬────┘ └────┬────┘ └────┬────┘ │ + │ │ │ │ │ + │ ┌────▼────────────▼────────────▼────┐ │ + │ │ Processing Pipeline │ │ + │ └────┬────────────┬────────────┬────┘ │ + │ │ │ │ │ + │ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │ + │ │ Traces │ │ Metrics │ │ Logs │ │ + │ │Exporter │ │Exporter │ │Exporter │ │ + │ └─────────┘ └─────────┘ └─────────┘ │ + └─────────────┬────────────┬────────────┬────────────────┘ + │ │ │ + │ ┌────▼─────┐ │ + │ │Prometheus│ │ + │ │ Scraper │ │ + │ └────┬─────┘ │ + │ │ │ + ┌───────▼────────────▼────────────▼──────┐ + │ Storage Layer │ + │ ┌──────────┐ ┌────────────┐ ┌────────┐│ + │ │ Traces │ │ Metrics │ │ Logs ││ + │ │ Backend │ │(Prometheus │ │Backend ││ + │ │ │ │ TSDB) │ │ ││ + │ └──────────┘ └─────┬──────┘ └────────┘│ + └─────────────────────┼──────────────────┘ + │ + ┌─────▼──────┐ + │AlertManager│ + └─────┬──────┘ + │ + ┌─────▼───────┐ + │Visualization│ + │ (Grafana) │ + └─────────────┘ From cbc99f0ca69f3e21cde9722f34b24554cf17db2c Mon Sep 17 00:00:00 2001 From: Victor Palma Date: Wed, 29 Oct 2025 19:39:53 -0500 Subject: [PATCH 03/10] docs: update README.md to reflect opentelemetry-kube-stack migration - Replace opentelemetry-operator references with opentelemetry-kube-stack - Update service description to emphasize complete observability stack - Remove deprecated opentelemetry-operator service files - Enhance feature descriptions with Prometheus and Jaeger integration --- README.md | 17 +- .../services/opentelemetry-operator/README.md | 119 ------------- .../helm-values/hardened-values-v0.98.0.yaml | 163 ------------------ .../opentelemetry-operator/helmrelease.yaml | 37 ---- .../opentelemetry-operator/kustomization.yaml | 14 -- .../opentelemetry-operator/namespace.yaml | 5 - .../opentelemetry-operator/source.yaml | 8 - 7 files changed, 9 insertions(+), 354 deletions(-) delete mode 100644 applications/base/services/opentelemetry-operator/README.md delete mode 100644 applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml delete mode 100644 applications/base/services/opentelemetry-operator/helmrelease.yaml delete mode 100644 applications/base/services/opentelemetry-operator/kustomization.yaml delete mode 100644 applications/base/services/opentelemetry-operator/namespace.yaml delete mode 100644 applications/base/services/opentelemetry-operator/source.yaml diff --git a/README.md b/README.md index 0c21398..55387ab 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ applications/ | **kube-prometheus-stack** | Core Service | `observability` | Complete monitoring and alerting stack | | **metallb** | Core Service | `metallb-system` | Bare metal load balancer | | **olm** | Core Service | `olm` | Operator Lifecycle Manager | -| **opentelemetry-operator** | Core Service | `observability` | OpenTelemetry operator for auto-instrumentation | +| **opentelemetry-kube-stack** | Core Service | 
`observability` | Complete OpenTelemetry observability stack | | **sealed-secrets** | Core Service | `sealed-secrets` | Encrypted secrets management | | **velero** | Core Service | `velero` | Cluster backup and disaster recovery | | **alert-proxy** | Managed Service | `rackspace` | Rackspace alert aggregation | @@ -104,16 +104,17 @@ applications/ - Dependency resolution - Automatic updates -#### **opentelemetry-operator** -- **Purpose**: OpenTelemetry operator for auto-instrumentation and collector management -- **Source**: OpenTelemetry Helm repository (`https://open-telemetry.github.io/opentelemetry-helm-charts`) +#### **opentelemetry-kube-stack** +- **Purpose**: Complete OpenTelemetry observability stack for Kubernetes +- **Source**: OpenTelemetry Kube Stack Helm repository (`https://charts.opentelemetry.io`) - **Namespace**: `observability` - **Features**: - - Automatic OpenTelemetry instrumentation injection - - OpenTelemetry Collector deployment and management - - Custom resource definitions for OpenTelemetry configuration - - Webhook-based sidecar injection + - OpenTelemetry Operator for auto-instrumentation and collector management + - Pre-configured OpenTelemetry Collector for metrics, traces, and logs + - Automatic service discovery and monitoring - Multi-language auto-instrumentation support (Java, Node.js, Python, .NET, Go) + - Integration with Prometheus and Jaeger for complete observability + - Custom resource definitions for OpenTelemetry configuration #### **sealed-secrets** - **Purpose**: Encrypted secrets management diff --git a/applications/base/services/opentelemetry-operator/README.md b/applications/base/services/opentelemetry-operator/README.md deleted file mode 100644 index a73f16a..0000000 --- a/applications/base/services/opentelemetry-operator/README.md +++ /dev/null @@ -1,119 +0,0 @@ -# OpenTelemetry Operator - -The OpenTelemetry Operator is a Kubernetes operator that manages OpenTelemetry Collector instances and auto-instrumentation of workloads using OpenTelemetry instrumentation libraries. - -## Overview - -This service deploys the OpenTelemetry Operator in the `observability` namespace with security hardening and high availability configuration. 
- -## Features - -- **Auto-instrumentation**: Automatically inject OpenTelemetry instrumentation into applications -- **Collector Management**: Deploy and manage OpenTelemetry Collector instances -- **CRD Management**: Provides custom resources for OpenTelemetry configuration -- **Webhook Support**: Admission webhooks for sidecar injection and validation - -## Configuration - -### Chart Information -- **Chart**: `opentelemetry/opentelemetry-operator` -- **Version**: `0.98.0` -- **App Version**: `0.137.0` -- **Repository**: https://open-telemetry.github.io/opentelemetry-helm-charts - -### Security Hardening - -The deployment includes the following security measures: - -- **Non-root execution**: All containers run as non-root user (65532) -- **Read-only root filesystem**: Containers use read-only root filesystems -- **Dropped capabilities**: All Linux capabilities are dropped -- **Security profiles**: Uses RuntimeDefault seccomp profile -- **Resource limits**: CPU and memory limits configured for all containers - -### High Availability - -- **Replica count**: 2 replicas for high availability -- **Pod Disruption Budget**: Ensures minimum availability during disruptions -- **Leader election**: Enabled to prevent split-brain scenarios -- **Anti-affinity**: Distributes pods across nodes (when configured) - -### Monitoring - -- **ServiceMonitor**: Enabled for Prometheus metrics collection -- **Metrics endpoint**: Exposes metrics on port 8080 -- **Health checks**: Readiness and liveness probes configured - -## Custom Resources - -The operator provides the following custom resources: - -- **OpenTelemetryCollector**: Manages collector deployments -- **Instrumentation**: Configures auto-instrumentation for applications -- **OpAMPBridge**: Manages OpAMP bridge instances - -## Dependencies - -- **cert-manager**: Required for TLS certificate management of admission webhooks -- **Prometheus Operator**: Optional, for ServiceMonitor support - -## Usage - -After deployment, you can create OpenTelemetry resources: - -```yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector - namespace: observability -spec: - config: | - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - processors: - batch: - exporters: - logging: - loglevel: debug - service: - pipelines: - traces: - receivers: [otlp] - processors: [batch] - exporters: [logging] -``` - -## Troubleshooting - -### Common Issues - -1. **Webhook failures**: Ensure cert-manager is deployed and healthy -2. **CRD conflicts**: Check for existing OpenTelemetry CRDs if upgrading -3. 
**RBAC issues**: Verify cluster-admin permissions during installation - -### Useful Commands - -```bash -# Check operator status -kubectl get pods -n observability -l app.kubernetes.io/name=opentelemetry-operator - -# View operator logs -kubectl logs -n observability -l app.kubernetes.io/name=opentelemetry-operator - -# Check CRDs -kubectl get crd | grep opentelemetry - -# Verify webhooks -kubectl get validatingwebhookconfiguration | grep opentelemetry -kubectl get mutatingwebhookconfiguration | grep opentelemetry -``` - -## References - -- [OpenTelemetry Operator Documentation](https://opentelemetry.io/docs/kubernetes/operator/) -- [Helm Chart Repository](https://github.com/open-telemetry/opentelemetry-helm-charts) -- [OpenTelemetry Specification](https://opentelemetry.io/docs/specs/) \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml b/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml deleted file mode 100644 index bd5ef1a..0000000 --- a/applications/base/services/opentelemetry-operator/helm-values/hardened-values-v0.98.0.yaml +++ /dev/null @@ -1,163 +0,0 @@ -# Hardened values for OpenTelemetry Operator v0.98.0 -# Security-focused configuration following openCenter standards - -# High availability configuration -replicaCount: 2 - -# Pod Disruption Budget for high availability -pdb: - create: true - minAvailable: 1 - -# Manager configuration with security hardening -manager: - image: - repository: ghcr.io/open-telemetry/opentelemetry-operator/opentelemetry-operator - imagePullPolicy: IfNotPresent - - # Collector image configuration - collectorImage: - repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-k8s - tag: 0.137.0 - - # Resource limits for production workloads - resources: - limits: - cpu: 200m - memory: 256Mi - ephemeral-storage: 100Mi - requests: - cpu: 100m - memory: 128Mi - ephemeral-storage: 50Mi - - # Environment variables - env: - ENABLE_WEBHOOKS: "true" - - # ServiceAccount configuration - serviceAccount: - create: true - annotations: {} - - # Enable Prometheus monitoring - serviceMonitor: - enabled: true - extraLabels: {} - annotations: {} - metricsEndpoints: - - port: metrics - - # Enable leader election for HA - leaderElection: - enabled: true - - # Security context - hardened configuration - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - readOnlyRootFilesystem: true - seccompProfile: - type: RuntimeDefault - -# Kube RBAC Proxy configuration -kubeRBACProxy: - enabled: true - image: - repository: quay.io/brancz/kube-rbac-proxy - tag: v0.19.1 - - # Resource limits - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi - - # Security context - hardened configuration - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - readOnlyRootFilesystem: true - seccompProfile: - type: RuntimeDefault - -# Admission webhooks configuration -admissionWebhooks: - create: true - servicePort: 443 - failurePolicy: Fail - - # Pod injection policy - pods: - failurePolicy: Ignore - - # Webhook timeout - timeoutSeconds: 10 - - # Use cert-manager for TLS certificates - certManager: - enabled: true - certificateAnnotations: {} - issuerAnnotations: {} - - # Disable auto-generated certificates since we use cert-manager - autoGenerateCert: - enabled: false - -# CRDs management -crds: - create: true 
- -# RBAC configuration -role: - create: true - -clusterRole: - create: true - -# Node scheduling - Linux nodes only -nodeSelector: - kubernetes.io/os: linux - -# Pod-level security context -securityContext: - runAsGroup: 65532 - runAsNonRoot: true - runAsUser: 65532 - fsGroup: 65532 - -# Service account token mounting -automountServiceAccountToken: true - -# Test framework security hardening -testFramework: - image: - repository: busybox - tag: latest - - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - readOnlyRootFilesystem: true - seccompProfile: - type: RuntimeDefault - - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/helmrelease.yaml b/applications/base/services/opentelemetry-operator/helmrelease.yaml deleted file mode 100644 index 21b3a43..0000000 --- a/applications/base/services/opentelemetry-operator/helmrelease.yaml +++ /dev/null @@ -1,37 +0,0 @@ ---- -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: opentelemetry-operator - namespace: observability -spec: - releaseName: opentelemetry-operator - interval: 5m - timeout: 10m - driftDetection: - mode: enabled - install: - remediation: - retries: 3 - remediateLastFailure: true - upgrade: - remediation: - retries: 0 - remediateLastFailure: false - targetNamespace: observability - chart: - spec: - chart: opentelemetry-operator - version: 0.98.0 - sourceRef: - kind: HelmRepository - name: opentelemetry - namespace: observability - valuesFrom: - - kind: Secret - name: opentelemetry-operator-values-base - valuesKey: hardened.yaml - - kind: Secret - name: opentelemetry-operator-values-override - valuesKey: override.yaml - optional: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/kustomization.yaml b/applications/base/services/opentelemetry-operator/kustomization.yaml deleted file mode 100644 index 56c09a5..0000000 --- a/applications/base/services/opentelemetry-operator/kustomization.yaml +++ /dev/null @@ -1,14 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - "namespace.yaml" - - "source.yaml" - - "helmrelease.yaml" -secretGenerator: - - name: opentelemetry-operator-values-base - type: Opaque - files: - - hardened.yaml=helm-values/hardened-values-v0.98.0.yaml - options: - disableNameSuffixHash: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/namespace.yaml b/applications/base/services/opentelemetry-operator/namespace.yaml deleted file mode 100644 index bd5727d..0000000 --- a/applications/base/services/opentelemetry-operator/namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: observability \ No newline at end of file diff --git a/applications/base/services/opentelemetry-operator/source.yaml b/applications/base/services/opentelemetry-operator/source.yaml deleted file mode 100644 index 9393c6b..0000000 --- a/applications/base/services/opentelemetry-operator/source.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -apiVersion: source.toolkit.fluxcd.io/v1 -kind: HelmRepository -metadata: - name: opentelemetry -spec: - url: https://open-telemetry.github.io/opentelemetry-helm-charts - interval: 1h \ No newline at end of file From 40fb9b418abed33d2a83e7441887ee48453efd97 Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Thu, 30 Oct 2025 16:42:44 +0000 
Subject: [PATCH 04/10] fix: Fix NS labels and values --- .../hardened-values-v0.11.1 copy.yaml | 233 ++ .../helm-values/hardened-values-v0.11.1.yaml | 1906 +++++++++++++++-- .../opentelemetry-kube-stack/namespace.yaml | 7 +- 3 files changed, 1952 insertions(+), 194 deletions(-) create mode 100644 applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml new file mode 100644 index 0000000..1c11fdd --- /dev/null +++ b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml @@ -0,0 +1,233 @@ +# Security configurations for OpenTelemetry Kube Stack +# Version: 0.11.1 + +# Cluster name for identification +clusterName: "openCenter-cluster" + +# OpenTelemetry Operator configuration +opentelemetry-operator: + enabled: true + manager: + # Security context for operator manager + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits for operator + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "500m" + # Admission webhooks configuration + admissionWebhooks: + failurePolicy: "Ignore" + # Security context for webhooks + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Default collector configuration with security hardening +defaultCRConfig: + enabled: true + mode: deployment + replicas: 2 + + # Security contexts + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + + podSecurityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + seccompProfile: + type: RuntimeDefault + + # Container security context + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + + # Resource limits + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + + # Node selector for Linux nodes + nodeSelector: + kubernetes.io/os: linux + + # Basic OTLP configuration + config: + receivers: + otlp: + protocols: + grpc: + endpoint: ${env:MY_POD_IP}:4317 + http: + endpoint: ${env:MY_POD_IP}:4318 + processors: + batch: + timeout: 1s + send_batch_size: 1024 + memory_limiter: + limit_mib: 400 + spike_limit_mib: 100 + check_interval: 5s + exporters: + logging: + loglevel: info + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [logging] + +# Kube State Metrics configuration +kubeStateMetrics: + enabled: true + +kube-state-metrics: + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits + resources: + requests: + memory: "64Mi" + cpu: "50m" 
+ limits: + memory: "128Mi" + cpu: "200m" + # Node selector + nodeSelector: + kubernetes.io/os: linux + # Prometheus monitoring + prometheus: + monitor: + enabled: true + honorLabels: true + +# Node Exporter configuration +nodeExporter: + enabled: true + +prometheus-node-exporter: + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + # Resource limits + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "200m" + # Node selector + nodeSelector: + kubernetes.io/os: linux + # Prometheus monitoring + prometheus: + monitor: + enabled: true + jobLabel: node-exporter + +# Kubernetes service monitors (disabled to avoid conflicts with existing monitoring) +kubernetesServiceMonitors: + enabled: false + +# Individual component monitors (disabled to avoid conflicts) +kubeApiServer: + enabled: false +kubelet: + enabled: false +kubeControllerManager: + enabled: false +coreDns: + enabled: false +kubeEtcd: + enabled: false +kubeScheduler: + enabled: false +kubeProxy: + enabled: false + +# CRDs installation +crds: + installOtel: true + installPrometheus: false # Disabled to avoid conflicts with existing Prometheus stack + +# Cleanup job configuration +cleanupJob: + enabled: true + image: + repository: rancher/kubectl + tag: v1.34.1 + # Security context for cleanup job + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml index 1c11fdd..cfd2b81 100644 --- a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml +++ b/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml @@ -1,233 +1,1753 @@ -# Security configurations for OpenTelemetry Kube Stack -# Version: 0.11.1 +# Top level field indicating an override for fullname +fullnameOverride: "" +# Top level field indicating an override for the namespace +namespaceOverride: "" -# Cluster name for identification -clusterName: "openCenter-cluster" +# Top level field specifying the name of the cluster +clusterName: "" -# OpenTelemetry Operator configuration +# Extra environment variables to add to each collector, bridge and instrumentation +extraEnvs: [] + +# Enables a cleanup job to make sure the CRs are uninstalled before the operator +cleanupJob: + # It is recommended to always keep this enabled so that running helm uninstall works properly. + # For non-helm installations i.e. ones created via helm template, it may make sense to disable this. + # For those installations, ensure that uninstallation for the operator happens _after_ the deletion of the CRs. + enabled: true + # Image details for the kubectl + image: + repository: rancher/kubectl + tag: v1.34.1 + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). + digest: "" + # To use the existingServiceAccount + existingServiceAccount: "" + +# Should the CRDs be installed by this chart. +crds: + # Control whether the opentelemetry.io CRDS should be installed. 
+ installOtel: true + # Control whether the monitoring.coreos CRDS should be installed. + installPrometheus: true + +# Top level field related to the OpenTelemetry Operator opentelemetry-operator: + # Field indicating whether the operator is enabled or not enabled: true manager: - # Security context for operator manager - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - # Resource limits for operator - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "500m" - # Admission webhooks configuration + collectorImage: + repository: otel/opentelemetry-collector-k8s + # Sub-field for admission webhooks configuration admissionWebhooks: + # Policy for handling failures + # Setting this allows for an installation of the otel operator at the same time as the custom resources it manages. failurePolicy: "Ignore" - # Security context for webhooks - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - -# Default collector configuration with security hardening + + # This is disabled by default, as doing so creates a race condition with helm. + # https://github.com/open-telemetry/opentelemetry-helm-charts/issues/677 + # Users of this chart should _never_ set this to be true. If a user wishes + # to install the CRDs through the opentelemetry-operator chart, it is recommended + # to install the opentelemetry-operator chart separately and prior to the installation + # of this chart. + crds: + create: false + +# This is the default configuration for all collectors generated by the chart. +# Any collectors in the `collectors` are overlayed on top of this configuration. defaultCRConfig: - enabled: true + enabled: false + + # Suffix for the collector pool, by default the release name is prepended + suffix: "collector" + + # fullnameOverride allows overriding the collector's name + fullnameOverride: "" + + # Annotations for the collector + annotations: {} + # io.opentelemetry.com/resource: hello + + # Labels for the collector + labels: {} + # app: otc + + # scrape_configs_file allows the user to load an external file into + # the collector's prometheus scrape_configs. This is added to assist users + # coming from the prometheus ecosystem by allowing users to simply copy and paste + # directly from prometheus into this file to use the same config. 
+ scrape_configs_file: "" + + # Management state of the collector + managementState: managed + + # Configuration for cluster role binding + clusterRoleBinding: + enabled: true + clusterRoleName: "" + + # Number of replicas for the collector + # replicas: 1 + + # Mode of deployment for the collector mode: deployment - replicas: 2 - - # Security contexts - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - - podSecurityContext: - runAsNonRoot: true - runAsUser: 65534 - fsGroup: 65534 - seccompProfile: - type: RuntimeDefault - - # Container security context - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - - # Resource limits - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - - # Node selector for Linux nodes - nodeSelector: - kubernetes.io/os: linux - - # Basic OTLP configuration - config: - receivers: - otlp: - protocols: - grpc: - endpoint: ${env:MY_POD_IP}:4317 - http: - endpoint: ${env:MY_POD_IP}:4318 - processors: - batch: - timeout: 1s - send_batch_size: 1024 - memory_limiter: - limit_mib: 400 - spike_limit_mib: 100 - check_interval: 5s - exporters: - logging: - loglevel: info - service: - pipelines: - traces: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - metrics: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - logs: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - -# Kube State Metrics configuration -kubeStateMetrics: - enabled: true -kube-state-metrics: - # Security context - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - # Resource limits + # Service account associated with the collector + serviceAccount: "" + + # Image details for the collector + image: + # If you want to use the core image `otel/opentelemetry-collector`, you also need to change `command.name` value to `otelcol`. + repository: otel/opentelemetry-collector-k8s + pullPolicy: IfNotPresent + # By default, the version set for the collector will match the version of the operator being run. + tag: "" + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). 
+ digest: "" + + # Upgrade strategy for the collector + upgradeStrategy: automatic + + # Configuration options for the collector + config: {} + # receivers: + # otlp: + # protocols: + # grpc: + # endpoint: ${env:MY_POD_IP}:4317 + # http: + # endpoint: ${env:MY_POD_IP}:4318 + # exporters: + # otlp: + # endpoint: "otel-collector.default:4317" + # tls: + # insecure: true + # sending_queue: + # num_consumers: 4 + # queue_size: 100 + # retry_on_failure: + # enabled: true + # processors: + # batch: + # memory_limiter: + # # 80% of maximum memory up to 2G + # limit_mib: 400 + # # 25% of limit up to 2G + # spike_limit_mib: 100 + # check_interval: 5s + # extensions: + # zpages: {} + # service: + # extensions: [zpages] + # pipelines: + # traces: + # receivers: [otlp] + # processors: [memory_limiter, batch] + # exporters: [otlp] + + # Whether to use host network for the collector + hostNetwork: false + + # Whether to share process namespace for the collector + shareProcessNamespace: false + + # Priority class name for the collector + priorityClassName: "" + + # Termination grace period for the collector + terminationGracePeriodSeconds: 30 + + # Resource requests and limits for the collector resources: requests: memory: "64Mi" - cpu: "50m" + cpu: "250m" limits: memory: "128Mi" - cpu: "200m" - # Node selector - nodeSelector: - kubernetes.io/os: linux - # Prometheus monitoring - prometheus: - monitor: - enabled: true - honorLabels: true + cpu: "250m" -# Node Exporter configuration -nodeExporter: + # Node selector for the collector + nodeSelector: {} + # nodeType: worker + + # Arguments for the collector + args: {} + # arg1: value1 + # arg2: value2 + + # Autoscaler configuration for the collector + autoscaler: {} + # minReplicas: 1 + # maxReplicas: 10 + # targetCPUUtilization: 50 + + # Pod disruption budget for the collector + podDisruptionBudget: {} + # maxUnavailable: 1 + + # Security context for the collector + securityContext: {} + # runAsUser: 1000 + # capabilities: + # drop: + # - ALL + + # Pod security context for the collector + podSecurityContext: {} + # runAsUser: 1000 + + # Annotations for the collector's pods + podAnnotations: {} + # prometheus.io/scrape: "true" + + # Target allocator configuration + targetAllocator: {} + # replicas: 1 + # nodeSelector: + # nodeType: worker + # resources: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + # allocationStrategy: consistent-hashing + # filterStrategy: relabel-config + # serviceAccount: my-service-account + # image: myregistry/myimage:latest + # enabled: true + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/e2e-az-name + # operator: In + # values: + # - e2e-az1 + # - e2e-az2 + # # Configuration for Prometheus Custom Resources + # prometheusCR: + # enabled: true + # scrapeInterval: 30s + # podMonitorSelector: + # key1: value1 + # key2: value2 + # serviceMonitorSelector: + # key1: value1 + # key2: value2 + # probeSelector: + # key1: value1 + # key2: value2 + # scrapeConfigSelector: + # key1: value1 + # key2: value2 + # # List of namespaces to allow for scraping + # allowNamespaces: + # - namespace-1 + # - namespace-2 + # # List of namespaces to exclude from scraping + # denyNamespaces: + # - namespace-3 + # - namespace-4 + # securityContext: + # runAsUser: 1000 + # capabilities: + # drop: + # - ALL + # podSecurityContext: + # runAsUser: 1000 + # # Topology spread constraints for the target 
allocator + # topologySpreadConstraints: + # - maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: DoNotSchedule + # # Tolerations for the collector + # tolerations: + # - key: "key" + # operator: "Equal" + # value: "value" + # effect: "NoSchedule" + # # Environment variables for the target allocator + # env: + # - name: ENV_VAR1 + # value: value1 + # - name: ENV_VAR2 + # value: value2 + # # Observability configuration for the target allocator + # observability: + # metrics: + # enableMetrics: true + + # Affinity configuration for the collector + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/e2e-az-name + # operator: In + # values: + # - e2e-az1 + # - e2e-az2 + + # Lifecycle configuration for the collector + lifecycle: {} + # preStop: + # exec: + # command: + # [ + # "/bin/sh", + # "-c", + # "echo Hello from the preStop handler > /dev/termination-log", + # ] + + # Liveness probe configuration for the collector + livenessProbe: {} + # initialDelaySeconds: 3 + # periodSeconds: 5 + # timeoutSeconds: 2 + # failureThreshold: 5 + + # Observability configuration for the collector + observability: {} + # metrics: + # enableMetrics: true + + # NOTE: the updateStrategy value is deprecated. Use daemonSetUpdateStrategy instead. + updateStrategy: {} + # type: RollingUpdate + + # Update strategy for the DaemonSet collector + daemonSetUpdateStrategy: {} + # type: RollingUpdate + + # Update strategy for the Deployment collector + deploymentUpdateStrategy: {} + # type: RollingUpdate + + # Volume mounts for the collector + volumeMounts: [] + # - name: data + # mountPath: /data + + # Ports configuration for the collector + # The operator automatically calculates ports for known receivers and exporters + # Set any custom ports here. + ports: [] + # - name: http + # protocol: TCP + # port: 80 + # targetPort: 8080 + + # Environment variables for the collector + env: [] + # - name: ENV_VAR1 + # value: value1 + # - name: ENV_VAR2 + # value: value2 + + # Volume claim templates for the collector + volumeClaimTemplates: [] + # - metadata: + # name: storage + # spec: + # accessModes: ["ReadWriteOnce"] + # resources: + # requests: + # storage: 1Gi + + # Tolerations for the collector + tolerations: [] + # - key: "key" + # operator: "Equal" + # value: "value" + # effect: "NoSchedule" + + # Volumes for the collector + volumes: [] + # - name: config-volume + # configMap: + # name: config + + # Init containers for the collector + initContainers: [] + # - name: init-nginx + # image: nginx + + # Additional containers for the collector + additionalContainers: [] + # - name: additional-container + # image: busybox + + # Topology spread constraints for the collector + topologySpreadConstraints: [] + # - maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: DoNotSchedule + # labelSelector: + # matchLabels: + # app: my-app + + # Config maps for the collector + configmaps: [] + # - name: config + # mountPath: /etc/config + + # Handles basic configuration of components that + # also require k8s modifications to work correctly. + # .Values.config can be used to modify/add to a preset + # component configuration, but CANNOT be used to remove + # preset configuration. If you require removal of any + # sections of a preset configuration, you cannot use + # the preset. 
Instead, configure the component manually in
+ # .Values.config and use the other fields supplied in the
+ # values.yaml to configure k8s as necessary.
+ presets:
+ # Configures the collector to collect logs.
+ # Adds the filelog receiver to the logs pipeline
+ # and adds the necessary volumes and volume mounts.
+ # Best used with mode = daemonset.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#filelog-receiver for details on the receiver.
+ logsCollection:
+ enabled: false
+ includeCollectorLogs: true
+ # Enabling this writes checkpoints in /var/lib/otelcol/ host directory.
+ # Note this changes the collector's user to root, so that it can write to the host directory.
+ storeCheckpoints: false
+ # The maximum bytes size of the recombined field.
+ # Once the size exceeds the limit, all received entries of the source will be combined and flushed.
+ maxRecombineLogSize: 102400
+ # Configures the collector to collect host metrics.
+ # Adds the hostmetrics receiver to the metrics pipeline
+ # and adds the necessary volumes and volume mounts.
+ # Best used with mode = daemonset.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#host-metrics-receiver for details on the receiver.
+ hostMetrics:
+ enabled: false
+ # Configures the Kubernetes Processor to add Kubernetes metadata.
+ # Adds the k8sattributes processor to all the pipelines
+ # and adds the necessary rules to ClusterRole.
+ # Best used with mode = daemonset.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-attributes-processor for details on the processor.
+ kubernetesAttributes:
+ enabled: false
+ # When enabled, the processor will extract all labels for an associated pod and add them as resource attributes.
+ # The label's exact name will be the key.
+ extractAllPodLabels: false
+ # When enabled, the processor will extract all annotations for an associated pod and add them as resource attributes.
+ # The annotation's exact name will be the key.
+ extractAllPodAnnotations: false
+ # Configures the collector to collect node, pod, and container metrics from the API server on a kubelet.
+ # Adds the kubeletstats receiver to the metrics pipeline
+ # and adds the necessary rules to ClusterRole.
+ # Best used with mode = daemonset.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubeletstats-receiver for details on the receiver.
+ kubeletMetrics:
+ enabled: false
+ # Configures the collector to collect kubernetes events.
+ # Adds the k8sobjects receiver to the logs pipeline
+ # and collects kubernetes events by default.
+ # Best used with mode = deployment or statefulset.
+ # MUST be used by a collector with a single replica.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-objects-receiver for details on the receiver.
+ kubernetesEvents:
+ enabled: false
+ # Configures the Kubernetes Cluster Receiver to collect cluster-level metrics.
+ # Adds the k8s_cluster receiver to the metrics pipeline
+ # and adds the necessary rules to ClusterRole.
+ # Best used with mode = deployment or statefulset.
+ # MUST be used by a collector with a single replica.
+ # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-cluster-receiver for details on the receiver.
+ clusterMetrics: + enabled: false + +# Collectors is a map of collector configurations of the form: +# collectors: +# collectorName: +# enabled: true +# name: "example" +# Each collector configuration is layered on top of the `defaultCRConfig`, overriding a default if set. +# This configuration allows for multiple layers of overrides for different clusters. For example, you could +# create a collector called test with an OTLP exporter in your values.yaml, and then override the endpoint's +# destination in a file called values-staging.yaml. +collectors: + daemon: + suffix: daemon + mode: daemonset + enabled: true + resources: + limits: + cpu: 200m + memory: 500Mi + requests: + cpu: 100m + memory: 250Mi + # A scrape config file to instruct the daemon collector to pull metrics from any matching targets on the same node with + # prometheus.io/scrape=true + # This config also scrapes a running node exporter and the kubelet CAdvisor metrics which aren't currently supported. + scrape_configs_file: "daemon_scrape_configs.yaml" + presets: + logsCollection: + enabled: true + kubeletMetrics: + enabled: true + hostMetrics: + enabled: true + kubernetesAttributes: + enabled: true + kubernetesEvents: + enabled: true + clusterMetrics: + enabled: true + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + processors: + resourcedetection/env: + detectors: [env, k8snode] + timeout: 2s + override: false + resource/hostname: + attributes: + - key: host.name + from_attribute: k8s.node.name + action: insert + batch: + send_batch_size: 1000 + timeout: 1s + send_batch_max_size: 1500 + exporters: + debug: {} + + service: + pipelines: + traces: + receivers: + - otlp + processors: + - resourcedetection/env + - resource/hostname + - batch + exporters: + - debug + metrics: + receivers: + - otlp + processors: + - resourcedetection/env + - resource/hostname + - batch + exporters: + - debug + logs: + receivers: + - otlp + processors: + - resourcedetection/env + - resource/hostname + - batch + exporters: + - debug + +# Cluster role configuration +clusterRole: + # Whether the cluster role is enabled or not enabled: true -prometheus-node-exporter: - # Security context - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - # Resource limits + # Annotations for the cluster role + annotations: {} + + # Rules for the cluster role + rules: [] + +# Instrumentation configuration +instrumentation: + # Whether instrumentation is enabled or not + enabled: false + labels: {} + annotations: {} + + # Exporter configuration + exporter: + # This is the default collector's service + # Upon creation of a tracing collector, edit this endpoint. 
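+ # Hedged note (assumption, not an upstream guarantee): the default below matches the Service the
+ # operator creates for a collector CR named "collector" ("<name>-collector"). If a dedicated
+ # tracing collector were created via the collectors map above, this could hypothetically point at,
+ # for example: endpoint: http://tracing-collector.observability.svc.cluster.local:4317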
+ endpoint: http://collector-collector:4317 + + # Resource configuration + resource: + resourceAttributes: {} + # environment: dev + addK8sUIDAttributes: true + + # Propagators configuration + propagators: + - tracecontext + - baggage + - b3 + - b3multi + - jaeger + - xray + - ottrace + + # Sampler configuration + sampler: {} + # type: parentbased_always_on + # argument: "0.25" + + # Environment variables for instrumentation + env: [] + # - name: ENV_VAR1 + # value: value1 + # - name: ENV_VAR2 + # value: value2 + + # Java agent configuration + java: {} + # image: myregistry/java-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: JAVA_ENV_VAR + # value: java_value + # resources: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # NodeJS agent configuration + nodejs: {} + # image: myregistry/nodejs-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: NODEJS_ENV_VAR + # value: nodejs_value + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # Python agent configuration + python: {} + # image: myregistry/python-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: PYTHON_ENV_VAR + # value: python_value + # # Required if endpoint is set to 4317. + # # Python autoinstrumentation uses http/proto by default + # # so data must be sent to 4318 instead of 4317. + # - name: OTEL_EXPORTER_OTLP_ENDPOINT + # value: http://otel-collector:4318 + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # .NET agent configuration + dotnet: {} + # image: myregistry/dotnet-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: DOTNET_ENV_VAR + # value: dotnet_value + # # Required if endpoint is set to 4317. + # # Dotnet autoinstrumentation uses http/proto by default + # # See https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/blob/888e2cd216c77d12e56b54ee91dafbc4e7452a52/docs/config.md#otlp + # - name: OTEL_EXPORTER_OTLP_ENDPOINT + # value: http://otel-collector:4318 + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # Go agent configuration + go: {} + # image: myregistry/go-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: GO_ENV_VAR + # value: go_value + # # Required if endpoint is set to 4317. 
+ # # Dotnet autoinstrumentation uses http/proto by default + # # See https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/blob/888e2cd216c77d12e56b54ee91dafbc4e7452a52/docs/config.md#otlp + # - name: OTEL_EXPORTER_OTLP_ENDPOINT + # value: http://otel-collector:4318 + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # Apache HTTPd agent configuration + apacheHttpd: {} + # image: myregistry/apache-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: APACHE_ENV_VAR + # value: apache_value + # attrs: + # - name: ATTRIBUTE_VAR + # value: attribute_value + # version: "2.4" + # configPath: "/usr/local/apache2/conf" + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + + # NGINX agent configuration + nginx: {} + # image: myregistry/nginx-agent:latest + # volumeLimitSize: 200Mi + # env: + # - name: NGINX_ENV_VAR + # value: nginx_value + # attrs: + # - name: ATTRIBUTE_VAR + # value: attribute_value + # configFile: "/etc/nginx/nginx.conf" + # resourceRequirements: + # requests: + # memory: "64Mi" + # cpu: "250m" + # limits: + # memory: "128Mi" + # cpu: "500m" + +# OpAMP bridge configuration. The OpAMP Bridge is an OpenTelemetry component +# that enables enhanced configuration and health monitoring for OpenTelemetry collectors +# deployed in Kubernetes. The Bridge pulls collector CRDs from the Kubernetes cluster and +# reports their configuration and status to a remote OpAMP Server. The Bridge will only pull +# collectors labeled with either +# * opentelemetry.io/opamp-reporting: true +# * opentelemetry.io/opamp-managed: true +# You can learn more about the Bridge's design here: +# https://docs.google.com/document/d/1M8VLNe_sv1MIfu5bUR5OV_vrMBnAI7IJN-7-IAr37JY +opAMPBridge: + # Whether OpAMP bridge is enabled or not + enabled: false + + # Adds `opentelemetry.io/opamp-reporting: true` to all collectors + addReportingLabel: true + # Adds `opentelemetry.io/opamp-managed: true` to all collectors + addManagedLabel: false + + # Endpoint for OpAMP server + endpoint: http://opamp-server:8080 + + # Description for additional information like non_identifying_attributes + description: {} + + # Headers configuration for OpAMP bridge + headers: {} + # Authorization: Bearer your_access_token + # Custom-Header: Custom-Value + + # Capabilities of OpAMP bridge + # You can learn more about OpAMP's capabilities here: + # https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#agenttoservercapabilities + capabilities: + AcceptsOpAMPConnectionSettings: true + AcceptsOtherConnectionSettings: true + AcceptsRemoteConfig: true + AcceptsRestartCommand: true + ReportsEffectiveConfig: true + ReportsHealth: true + ReportsOwnLogs: true + ReportsOwnMetrics: true + ReportsOwnTraces: true + ReportsRemoteConfig: true + ReportsStatus: true + + # Components allowed for OpAMP bridge + componentsAllowed: {} + # receiver: + # - otlp + # - prometheus + # processor: + # - batch + # - memory_limiter + # exporter: + # - prometheusremotewrite + + # Resources configuration for OpAMP bridge resources: - requests: - memory: "64Mi" - cpu: "50m" limits: - memory: "128Mi" - cpu: "200m" - # Node selector - nodeSelector: - kubernetes.io/os: linux - # Prometheus monitoring - prometheus: - monitor: - enabled: true - jobLabel: node-exporter + cpu: "250m" + memory: "256Mi" + requests: + cpu: "250m" + memory: "256Mi" + + # Security context for OpAMP bridge + 
securityContext: + runAsNonRoot: true + runAsUser: 1000 + + # Pod security context for OpAMP bridge + podSecurityContext: + fsGroup: 1000 + + # Pod annotations for OpAMP bridge + podAnnotations: {} + # prometheus.io/scrape: "true" + # prometheus.io/port: "8080" + + # Service account for OpAMP bridge + serviceAccount: "" + + # Image for OpAMP bridge + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/operator-opamp-bridge + pullPolicy: IfNotPresent + # By default, the version set for the bridge will match the version of the operator being run. + tag: "" + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). + digest: "" + + # Upgrade strategy for OpAMP bridge + upgradeStrategy: automatic + + # Volume mounts for OpAMP bridge + volumeMounts: [] + # - name: data + # mountPath: /data + + # Ports configuration for OpAMP bridge + ports: [] + # - name: http + # port: 8080 + # protocol: TCP + + # Environment variables for OpAMP bridge + env: [] + # - name: ENVIRONMENT + # value: production + + # Environment variables from config map for OpAMP bridge + envFrom: [] + # - configMapRef: + # name: opamp-config -# Kubernetes service monitors (disabled to avoid conflicts with existing monitoring) + # Tolerations for OpAMP bridge + tolerations: [] + # - key: "opamp" + # operator: "Equal" + # value: "true" + # effect: "NoSchedule" + + # Volumes for OpAMP bridge + volumes: [] + # - name: data + # emptyDir: {} + + # Whether to use host network for OpAMP bridge + hostNetwork: false + + # Priority class name for OpAMP bridge + priorityClassName: "" + + # Affinity configuration for OpAMP bridge + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: opamp + # operator: In + # values: + # - "true" + + # Topology spread constraints for OpAMP bridge + topologySpreadConstraints: [] + # - maxSkew: 1 + # topologyKey: "kubernetes.io/hostname" + # whenUnsatisfiable: "DoNotSchedule" + # labelSelector: + # matchLabels: + # opamp: "true" + + # Bridge cluster role configuration + # In order to function the bridge is given its default role to + # list and get pods and opentelemetry collectors + clusterRole: + # Whether the bridge cluster role is enabled or not + enabled: true + + # Annotations for the bridge cluster role + annotations: {} + + # Rules for the bridge cluster role + rules: [] + +############################ +# Prometheus Configuration # +# (optional) # +############################ +# This configuration sections allows for a direct replacement of the kube-prometheus-stack +# chart where the collector scrapes the same metrics as the default prometheus installation. + +## Flag to disable all the kubernetes component scrapers +## kubernetesServiceMonitors: enabled: false + ignoreNamespaceSelectors: false -# Individual component monitors (disabled to avoid conflicts) +## Component scraping the kube api server +## kubeApiServer: enabled: false + tlsConfig: + serverName: kubernetes + insecureSkipVerify: false + serviceMonitor: + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. 
Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + jobLabel: component + selector: + matchLabels: + component: apiserver + provider: kubernetes + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: + # Drop excessively noisy apiserver buckets. + - action: drop + regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50) + sourceLabels: + - __name__ + - le + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: + # - __meta_kubernetes_namespace + # - __meta_kubernetes_service_name + # - __meta_kubernetes_endpoint_port_name + # action: keep + # regex: default;kubernetes;https + # - targetLabel: __address__ + # replacement: kubernetes.default.svc:443 + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping the kubelet and kubelet-hosted cAdvisor +## the configuration for this is currently only in kubelet_scrape_configs.yaml +## This is because kubelet doesn't have a service and can only be scraped manually. kubelet: enabled: false + namespace: kube-system + + serviceMonitor: + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## If true, Prometheus use (respect) labels provided by exporter. + ## + honorLabels: true + + ## If true, Prometheus ingests metrics with timestamp provided by exporter. If false, Prometheus ingests metrics with timestamp of scrape. + ## + honorTimestamps: true + + ## Enable scraping the kubelet over https. For requirements to enable this see + ## https://github.com/prometheus-operator/prometheus-operator/issues/926 + ## + https: true + + ## Enable scraping /metrics/cadvisor from kubelet's service + ## + cAdvisor: true + + ## Enable scraping /metrics/probes from kubelet's service + ## + probes: true + +## Component scraping the kube controller manager +## kubeControllerManager: enabled: false + + ## If your kube controller manager is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeControllerManager.endpoints only the port and targetPort are used + ## + service: + enabled: true + ## If null or unset, the value is determined dynamically based on target Kubernetes version due to change + ## of default port in Kubernetes 1.22. + ## + port: null + targetPort: null + # selector: + # component: kube-controller-manager + + serviceMonitor: + enabled: true + ## Scrape interval. If not set, the Prometheus default scrape interval is used. 
+ ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + ## port: Name of the port the metrics will be scraped from + ## + port: http-metrics + + jobLabel: jobLabel + selector: {} + # matchLabels: + # component: kube-controller-manager + + ## Enable scraping kube-controller-manager over https. + ## Requires proper certs (not self-signed) and delegated authentication/authorization checks. + ## If null or unset, the value is determined dynamically based on target Kubernetes version. + ## + https: null + + # Skip TLS certificate validation when scraping + insecureSkipVerify: null + + # Name of the server to use when validating TLS certificate + serverName: null + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping coreDns. Use either this or kubeDns +## coreDns: enabled: false + endpoints: [] + service: + enabled: true + port: 9153 + targetPort: 9153 + # selector: + # k8s-app: kube-dns + serviceMonitor: + enabled: true + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. 
+ ## + proxyUrl: "" + + ## port: Name of the port the metrics will be scraped from + ## + port: http-metrics + + jobLabel: jobLabel + selector: {} + # matchLabels: + # k8s-app: kube-dns + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping kubeDns. Use either this or coreDns +## +kubeDns: + enabled: false + service: + dnsmasq: + port: 10054 + targetPort: 10054 + skydns: + port: 10055 + targetPort: 10055 + # selector: + # k8s-app: kube-dns + serviceMonitor: + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + jobLabel: jobLabel + selector: {} + # matchLabels: + # k8s-app: kube-dns + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. 
+ ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + dnsmasqMetricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + dnsmasqRelabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping etcd +## kubeEtcd: enabled: false + + ## If your etcd is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 2381 + targetPort: 2381 + # selector: + # component: etcd + + ## Configure secure access to the etcd cluster by loading a secret into prometheus and + ## specifying security configuration below. For example, with a secret named etcd-client-cert + ## + ## serviceMonitor: + ## scheme: https + ## insecureSkipVerify: false + ## serverName: localhost + ## caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca + ## certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client + ## keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key + ## + serviceMonitor: + enabled: true + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + scheme: http + insecureSkipVerify: false + serverName: "" + caFile: "" + certFile: "" + keyFile: "" + + ## port: Name of the port the metrics will be scraped from + ## + port: http-metrics + + jobLabel: jobLabel + selector: {} + # matchLabels: + # component: etcd + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. 
+ ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping kube scheduler +## kubeScheduler: enabled: false + + ## If your kube scheduler is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeScheduler.endpoints only the port and targetPort are used + ## + service: + enabled: true + ## If null or unset, the value is determined dynamically based on target Kubernetes version due to change + ## of default port in Kubernetes 1.23. + ## + port: null + targetPort: null + # selector: + # component: kube-scheduler + + serviceMonitor: + enabled: true + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + ## Enable scraping kube-scheduler over https. + ## Requires proper certs (not self-signed) and delegated authentication/authorization checks. + ## If null or unset, the value is determined dynamically based on target Kubernetes version. + ## + https: null + + ## port: Name of the port the metrics will be scraped from + ## + port: http-metrics + + jobLabel: jobLabel + selector: {} + # matchLabels: + # component: kube-scheduler + + ## Skip TLS certificate validation when scraping + insecureSkipVerify: null + + ## Name of the server to use when validating TLS certificate + serverName: null + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. 
+ ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Component scraping kube proxy +## kubeProxy: enabled: false -# CRDs installation -crds: - installOtel: true - installPrometheus: false # Disabled to avoid conflicts with existing Prometheus stack + ## If your kube proxy is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + service: + enabled: true + port: 10249 + targetPort: 10249 + # selector: + # k8s-app: kube-proxy + + serviceMonitor: + enabled: true + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + ## port: Name of the port the metrics will be scraped from + ## + port: http-metrics + + jobLabel: jobLabel + selector: {} + # matchLabels: + # k8s-app: kube-proxy + + ## Enable scraping kube-proxy over https. + ## Requires proper certs (not self-signed) and delegated authentication/authorization checks + ## + https: false + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## Additional labels + ## + additionalLabels: {} + # foo: bar + +## Controls whether the kube-state-metrics chart should be created. +## This block matches the configuration for the kube-prometheus-stack chart for compatibility. 
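+## Hedged example (an override, not the default used here): a cluster that does not already run
+## kube-prometheus-stack could turn the bundled scraper back on from a values override file, e.g.
+##   kubeStateMetrics:
+##     enabled: true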
+kubeStateMetrics: + enabled: false + +## Configuration for kube-state-metrics subchart +## The Kube-State-Metrics agent collects cluster-level metrics +## Read more here about the chart: +## https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics +kube-state-metrics: + namespaceOverride: "" + rbac: + create: true + releaseLabel: true + prometheus: + monitor: + enabled: true + + ## Scrape interval. If not set, the Prometheus default scrape interval is used. + ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## Scrape Timeout. If not set, the Prometheus default scrape timeout is used. + ## + scrapeTimeout: "" + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + # Keep labels from scraped data, overriding server-side labels + ## + honorLabels: true + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + + selfMonitor: + enabled: false + +## Controls whether the prometheus-node-exporter chart should be created. +## This block matches the configuration for the kube-prometheus-stack chart for compatibility. +nodeExporter: + enabled: false + +## Configuration for prometheus-node-exporter subchart +## This will install a daemonset that pulls metric data from each node +## Read more here about the chart: +## https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter +prometheus-node-exporter: + namespaceOverride: "" + podLabels: + ## Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards + ## + jobLabel: node-exporter + releaseLabel: true + extraArgs: + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + service: + portName: http-metrics + prometheus: + monitor: + enabled: true + + jobLabel: jobLabel + + ## Scrape interval. If not set, the Prometheus default scrape interval is used. 
+ ## + interval: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + ## How long until a scrape request times out. If not set, the Prometheus default scape timeout is used. + ## + scrapeTimeout: "" + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + metricRelabelings: [] + # - sourceLabels: [__name__] + # separator: ; + # regex: ^node_mountstats_nfs_(event|operations|transport)_.+ + # replacement: $1 + # action: drop + + ## RelabelConfigs to apply to samples before scraping + ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig + ## + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + rbac: + ## If true, create PSPs for node-exporter + ## + pspEnabled: false + +## Array of extra manifests to deploy +## This will deploy arbitrary manifests as part of the helm relase +extraObjects: [] +# - apiVersion: secrets-store.csi.x-k8s.io/v1 +# kind: SecretProviderClass +# metadata: +# name: my-secret-provider +# spec: +# parameters: +# objects: > +# - secretPath: xxxxxxx/yyyy +# objectName: "imagePullSecret" +# secretKey: registry-credentials +# vaultAddress: xxxxxxx +# provider: vault +# secretObjects: +# - data: +# - key: .dockerconfigjson +# objectName: imagePullSecret +# secretName: demo-image-pull-secrets +# type: kubernetes.io/dockerconfigjson -# Cleanup job configuration -cleanupJob: - enabled: true - image: - repository: rancher/kubectl - tag: v1.34.1 - # Security context for cleanup job - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true \ No newline at end of file diff --git a/applications/base/services/opentelemetry-kube-stack/namespace.yaml b/applications/base/services/opentelemetry-kube-stack/namespace.yaml index bd5727d..09f0d46 100644 --- a/applications/base/services/opentelemetry-kube-stack/namespace.yaml +++ b/applications/base/services/opentelemetry-kube-stack/namespace.yaml @@ -2,4 +2,9 @@ apiVersion: v1 kind: Namespace metadata: - name: observability \ No newline at end of file + name: observability + labels: + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/warn: baseline + pod-security.kubernetes.io/audit: baseline From 564dca2a48a496733bb4eecc47b276adad8354b9 Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Fri, 31 Oct 2025 15:15:55 +0000 Subject: 
[PATCH 05/10] fix: refactor observability to avoid namespace deletion --- .../kube-prometheus-stack/README.md | 0 .../helm-values/alerting-rules-overrides.yaml | 0 .../helm-values/alertmanager-overrides.yaml | 0 .../helm-values/hardened-values-v0.0.1.yaml | 0 .../helm-values/prometheus-overrides.yaml | 0 .../kube-prometheus-stack/helmrelease.yaml | 0 .../kube-prometheus-stack/kustomization.yaml | 1 - .../kube-prometheus-stack/source.yaml | 0 .../namespace}/namespace.yaml | 0 .../opentelemetry-kube-stack/README.md | 0 .../helm-values/hardened-values-v0.11.1 copy.yaml | 0 .../helm-values/hardened-values-v0.11.1.yaml | 0 .../opentelemetry-kube-stack/helmrelease.yaml | 0 .../opentelemetry-kube-stack/kustomization.yaml | 3 +-- .../opentelemetry-kube-stack/source.yaml | 0 .../services/opentelemetry-kube-stack/namespace.yaml | 10 ---------- 16 files changed, 1 insertion(+), 13 deletions(-) rename applications/base/services/{ => observability}/kube-prometheus-stack/README.md (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/helm-values/hardened-values-v0.0.1.yaml (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/helm-values/prometheus-overrides.yaml (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/helmrelease.yaml (100%) rename applications/base/services/{ => observability}/kube-prometheus-stack/kustomization.yaml (95%) rename applications/base/services/{ => observability}/kube-prometheus-stack/source.yaml (100%) rename applications/base/services/{kube-prometheus-stack => observability/namespace}/namespace.yaml (100%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/README.md (100%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml (100%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml (100%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/helmrelease.yaml (100%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/kustomization.yaml (83%) rename applications/base/services/{ => observability}/opentelemetry-kube-stack/source.yaml (100%) delete mode 100644 applications/base/services/opentelemetry-kube-stack/namespace.yaml diff --git a/applications/base/services/kube-prometheus-stack/README.md b/applications/base/services/observability/kube-prometheus-stack/README.md similarity index 100% rename from applications/base/services/kube-prometheus-stack/README.md rename to applications/base/services/observability/kube-prometheus-stack/README.md diff --git a/applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml b/applications/base/services/observability/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml rename to applications/base/services/observability/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml diff --git a/applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml 
b/applications/base/services/observability/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml rename to applications/base/services/observability/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml diff --git a/applications/base/services/kube-prometheus-stack/helm-values/hardened-values-v0.0.1.yaml b/applications/base/services/observability/kube-prometheus-stack/helm-values/hardened-values-v0.0.1.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/helm-values/hardened-values-v0.0.1.yaml rename to applications/base/services/observability/kube-prometheus-stack/helm-values/hardened-values-v0.0.1.yaml diff --git a/applications/base/services/kube-prometheus-stack/helm-values/prometheus-overrides.yaml b/applications/base/services/observability/kube-prometheus-stack/helm-values/prometheus-overrides.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/helm-values/prometheus-overrides.yaml rename to applications/base/services/observability/kube-prometheus-stack/helm-values/prometheus-overrides.yaml diff --git a/applications/base/services/kube-prometheus-stack/helmrelease.yaml b/applications/base/services/observability/kube-prometheus-stack/helmrelease.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/helmrelease.yaml rename to applications/base/services/observability/kube-prometheus-stack/helmrelease.yaml diff --git a/applications/base/services/kube-prometheus-stack/kustomization.yaml b/applications/base/services/observability/kube-prometheus-stack/kustomization.yaml similarity index 95% rename from applications/base/services/kube-prometheus-stack/kustomization.yaml rename to applications/base/services/observability/kube-prometheus-stack/kustomization.yaml index c538567..731d6be 100644 --- a/applications/base/services/kube-prometheus-stack/kustomization.yaml +++ b/applications/base/services/observability/kube-prometheus-stack/kustomization.yaml @@ -2,7 +2,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - "namespace.yaml" - "source.yaml" - "helmrelease.yaml" secretGenerator: diff --git a/applications/base/services/kube-prometheus-stack/source.yaml b/applications/base/services/observability/kube-prometheus-stack/source.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/source.yaml rename to applications/base/services/observability/kube-prometheus-stack/source.yaml diff --git a/applications/base/services/kube-prometheus-stack/namespace.yaml b/applications/base/services/observability/namespace/namespace.yaml similarity index 100% rename from applications/base/services/kube-prometheus-stack/namespace.yaml rename to applications/base/services/observability/namespace/namespace.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/README.md b/applications/base/services/observability/opentelemetry-kube-stack/README.md similarity index 100% rename from applications/base/services/opentelemetry-kube-stack/README.md rename to applications/base/services/observability/opentelemetry-kube-stack/README.md diff --git a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml similarity index 100% rename from 
applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml rename to applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml similarity index 100% rename from applications/base/services/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml rename to applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/helmrelease.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml similarity index 100% rename from applications/base/services/opentelemetry-kube-stack/helmrelease.yaml rename to applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/kustomization.yaml b/applications/base/services/observability/opentelemetry-kube-stack/kustomization.yaml similarity index 83% rename from applications/base/services/opentelemetry-kube-stack/kustomization.yaml rename to applications/base/services/observability/opentelemetry-kube-stack/kustomization.yaml index c30089d..757cc5a 100644 --- a/applications/base/services/opentelemetry-kube-stack/kustomization.yaml +++ b/applications/base/services/observability/opentelemetry-kube-stack/kustomization.yaml @@ -2,7 +2,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - "./namespace.yaml" - "./source.yaml" - "./helmrelease.yaml" @@ -11,4 +10,4 @@ secretGenerator: type: Opaque files: [hardened.yaml=helm-values/hardened-values-v0.11.1.yaml] options: - disableNameSuffixHash: true \ No newline at end of file + disableNameSuffixHash: true diff --git a/applications/base/services/opentelemetry-kube-stack/source.yaml b/applications/base/services/observability/opentelemetry-kube-stack/source.yaml similarity index 100% rename from applications/base/services/opentelemetry-kube-stack/source.yaml rename to applications/base/services/observability/opentelemetry-kube-stack/source.yaml diff --git a/applications/base/services/opentelemetry-kube-stack/namespace.yaml b/applications/base/services/opentelemetry-kube-stack/namespace.yaml deleted file mode 100644 index 09f0d46..0000000 --- a/applications/base/services/opentelemetry-kube-stack/namespace.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: observability - labels: - pod-security.kubernetes.io/enforce: privileged - pod-security.kubernetes.io/enforce-version: latest - pod-security.kubernetes.io/warn: baseline - pod-security.kubernetes.io/audit: baseline From 608803432e591bad2184d86fe560485608e3160d Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Wed, 5 Nov 2025 13:18:23 +0000 Subject: [PATCH 06/10] feat: Create base manifest files to deploy loki (#41) --- .../services/observability/loki/README.md | 15 + .../helm-values/hardened-values-v6.45.2.yaml | 4323 +++++++++++++++++ .../observability/loki/helmrelease.yaml | 36 + .../observability/loki/kustomization.yaml | 14 + .../services/observability/loki/source.yaml | 9 + .../hardened-values-v0.11.1 copy.yaml | 233 - 6 files changed, 4397 insertions(+), 233 deletions(-) create mode 100644 applications/base/services/observability/loki/README.md create mode 
100644 applications/base/services/observability/loki/helm-values/hardened-values-v6.45.2.yaml create mode 100644 applications/base/services/observability/loki/helmrelease.yaml create mode 100644 applications/base/services/observability/loki/kustomization.yaml create mode 100644 applications/base/services/observability/loki/source.yaml delete mode 100644 applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml diff --git a/applications/base/services/observability/loki/README.md b/applications/base/services/observability/loki/README.md new file mode 100644 index 0000000..2a0cbe5 --- /dev/null +++ b/applications/base/services/observability/loki/README.md @@ -0,0 +1,15 @@ +# Loki – Base Configuration + +This directory contains the **base manifests** for deploying [Grafana Loki](https://grafana.com/oss/loki/), a horizontally-scalable, highly-available log aggregation system designed for cloud-native environments. +It is designed to be **consumed by cluster repositories** as a remote base, allowing each cluster to apply **custom overrides** as needed. + +**About Grafana Loki:** + +- Provides a **cost-effective log aggregation solution** optimized for storing and querying logs from Kubernetes clusters and applications. +- Deployed in **Simple Scalable mode** with separate read and write paths for high availability and horizontal scaling. +- Integrates natively with **OpenTelemetry** for log collection using OTLP protocol, eliminating the need for additional log shippers. +- Indexes only metadata (labels) rather than full-text, resulting in **significantly lower storage costs** compared to traditional solutions. +- Queries logs using **LogQL**, a query language similar to PromQL, enabling powerful filtering and aggregation. +- Supports **multi-tenancy**, **retention policies**, and **compaction** for efficient long-term log storage. +- Automatically integrates with **Grafana** for unified visualization of logs alongside metrics and traces. +- Commonly used for troubleshooting application issues, audit logging, security analysis, and operational insights. diff --git a/applications/base/services/observability/loki/helm-values/hardened-values-v6.45.2.yaml b/applications/base/services/observability/loki/helm-values/hardened-values-v6.45.2.yaml new file mode 100644 index 0000000..2be3d37 --- /dev/null +++ b/applications/base/services/observability/loki/helm-values/hardened-values-v6.45.2.yaml @@ -0,0 +1,4323 @@ +# -- Overrides the version used to determine compatibility of resources with the target Kubernetes cluster. +# This is useful when using `helm template`, because then helm will use the client version of kubectl as the Kubernetes version, +# which may or may not match your cluster's server version. Example: 'v1.24.4'. Set to null to use the version that helm +# devises. 
+kubeVersionOverride: null + +global: + # -- Overrides the Docker registry globally for all images (standard format) + imageRegistry: null + image: + # -- Overrides the Docker registry globally for all images (deprecated, use global.imageRegistry) + registry: null + # -- Overrides the priorityClassName for all pods + priorityClassName: null + # -- configures cluster domain ("cluster.local" by default) + clusterDomain: "cluster.local" + # -- configures DNS service name + dnsService: "kube-dns" + # -- configures DNS service namespace + dnsNamespace: "kube-system" + # -- Common additional CLI arguments for all jobs (that is, -log.level debug, -config.expand-env=true or -log-config-reverse-order) + # scope: admin-api, backend, bloom-builder, bloom-gateway, bloom-planner, compactor, distributor, index-gateway, ingester, overrides-exporter, pattern-ingester, querier, query-frontend, query-scheduler, read, ruler, write. + extraArgs: + - -config.expand-env=true + # -- Common environment variables to add to all pods directly managed by this chart. + # scope: admin-api, backend, bloom-builder, bloom-gateway, bloom-planner, compactor, distributor, index-gateway, ingester, overrides-exporter, pattern-ingester, querier, query-frontend, query-scheduler, read, ruler, write. + extraEnv: [] + # -- Common source of environment injections to add to all pods directly managed by this chart. + # scope: admin-api, backend, bloom-builder, bloom-gateway, bloom-planner, compactor, distributor, index-gateway, ingester, overrides-exporter, pattern-ingester, querier, query-frontend, query-scheduler, read, ruler, write. + # For example to inject values from a Secret, use: + # extraEnvFrom: + # - secretRef: + # name: mysecret + extraEnvFrom: [] + # -- Common volumes to add to all pods directly managed by this chart. + # scope: admin-api, backend, bloom-builder, bloom-gateway, bloom-planner, compactor, distributor, index-gateway, ingester, overrides-exporter, pattern-ingester, querier, query-frontend, query-scheduler, read, ruler, write. + extraVolumes: [] + # -- Common mount points to add to all pods directly managed by this chart. + # scope: admin-api, backend, bloom-builder, bloom-gateway, bloom-planner, compactor, distributor, index-gateway, ingester, overrides-exporter, pattern-ingester, querier, query-frontend, query-scheduler, read, ruler, write. + extraVolumeMounts: [] +# -- Overrides the chart's name +nameOverride: null +# -- Overrides the chart's computed fullname +fullnameOverride: null +# -- Overrides the chart's namespace +namespaceOverride: null +# -- Overrides the chart's cluster label +clusterLabelOverride: null +# -- Image pull secrets for Docker images +imagePullSecrets: [] +# -- Deployment mode lets you specify how to deploy Loki. +# There are 3 options: +# - SingleBinary: Loki is deployed as a single binary, useful for small installs typically without HA, up to a few tens of GB/day. +# - SimpleScalable: Loki is deployed as 3 targets: read, write, and backend. Useful for medium installs easier to manage than distributed, up to a about 1TB/day. +# - Distributed: Loki is deployed as individual microservices. The most complicated but most capable, useful for large installs, typically over 1TB/day. 
+# There are also 2 additional modes used for migrating between deployment modes: +# - SingleBinary<->SimpleScalable: Migrate from SingleBinary to SimpleScalable (or vice versa) +# - SimpleScalable<->Distributed: Migrate from SimpleScalable to Distributed (or vice versa) +# Note: SimpleScalable and Distributed REQUIRE the use of object storage. +deploymentMode: SimpleScalable +###################################################################################################################### +# +# Base Loki Configs including kubernetes configurations and configurations for Loki itself, +# see below for more specifics on Loki's configuration. +# +###################################################################################################################### +# -- Configuration for running Loki +# @default -- See values.yaml +loki: + # Configures the liveness probe for all of the Loki pods + livenessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + # Configures the readiness probe for all of the Loki pods + readinessProbe: + httpGet: + path: /ready + port: http-metrics + periodSeconds: 10 + initialDelaySeconds: 15 + successThreshold: 1 + failureThreshold: 6 + timeoutSeconds: 1 + # Configures the startup probe for all of the Loki pods + startupProbe: {} + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki + # -- Overrides the image tag whose default is the chart's appVersion + tag: 3.5.7 + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Common annotations for all deployments/StatefulSets + annotations: {} + # -- Common annotations for all pods + podAnnotations: {} + # -- Common labels for all pods + podLabels: {} + # -- Common annotations for all services + serviceAnnotations: {} + # -- Common labels for all services + serviceLabels: {} + # -- The number of old ReplicaSets to retain to allow rollback + revisionHistoryLimit: 10 + # -- The SecurityContext for Loki pods + podSecurityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + # -- The SecurityContext for Loki containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Should enableServiceLinks be enabled. Default to enable + enableServiceLinks: true + # -- DNS config for Loki pods + dnsConfig: {} + ###################################################################################################################### + # + # Loki Configuration + # + # There are several ways to pass configuration to Loki, listing them here in order of our preference for how + # you should use this chart. + # 1. Use the templated value of loki.config below and the corresponding override sections which follow. + # This allows us to set a lot of important Loki configurations and defaults and also allows us to maintain them + # over time as Loki changes and evolves. + # 2. Use the loki.structuredConfig section. + # This will completely override the templated value of loki.config, so you MUST provide the entire Loki config + # including any configuration that we set in loki.config unless you explicitly are trying to change one of those + # values and are not able to do so with the templated sections. 
+ # If you choose this approach the burden is on you to maintain any changes we make to the templated config. + # 3. Use an existing secret or configmap to provide the configuration. + # This option is mostly provided for folks who have external processes which provide or modify the configuration. + # When using this option you can specify a different name for loki.generatedConfigObjectName and configObjectName + # if you have a process which takes the generated config and modifies it, or you can stop the chart from generating + # a config entirely by setting loki.generatedConfigObjectName to + # + ###################################################################################################################### + + # -- Defines what kind of object stores the configuration, a ConfigMap or a Secret. + # In order to move sensitive information (such as credentials) from the ConfigMap/Secret to a more secure location (e.g. vault), it is possible to use [environment variables in the configuration](https://grafana.com/docs/loki/latest/configuration/#use-environment-variables-in-the-configuration). + # Such environment variables can be then stored in a separate Secret and injected via the global.extraEnvFrom value. For details about environment injection from a Secret please see [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/#use-case-as-container-environment-variables). + configStorageType: ConfigMap + # -- The name of the object which Loki will mount as a volume containing the config. + # If the configStorageType is Secret, this will be the name of the Secret, if it is ConfigMap, this will be the name of the ConfigMap. + # The value will be passed through tpl. + configObjectName: '{{ include "loki.name" . }}' + # -- The name of the Secret or ConfigMap that will be created by this chart. + # If empty, no configmap or secret will be created. + # The value will be passed through tpl. + generatedConfigObjectName: '{{ include "loki.name" . }}' + # -- Config file contents for Loki + # @default -- See values.yaml + config: | + {{- if .Values.enterprise.enabled}} + {{- tpl .Values.enterprise.config . }} + {{- else }} + auth_enabled: {{ .Values.loki.auth_enabled }} + {{- end }} + + {{- with .Values.loki.server }} + server: + {{- toYaml . | nindent 2}} + {{- end}} + + {{- with .Values.loki.pattern_ingester }} + pattern_ingester: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + memberlist: + {{- if .Values.loki.memberlistConfig }} + {{- toYaml .Values.loki.memberlistConfig | nindent 2 }} + {{- else }} + {{- if .Values.loki.extraMemberlistConfig}} + {{- toYaml .Values.loki.extraMemberlistConfig | nindent 2}} + {{- end }} + join_members: + - {{ include "loki.memberlist" . }} + {{- with .Values.migrate.fromDistributed }} + {{- if .enabled }} + - {{ .memberlistService }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .Values.loki.ingester }} + ingester: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.ingester_client }} + ingester_client: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.block_builder }} + block_builder: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- if .Values.loki.commonConfig}} + common: + {{- toYaml .Values.loki.commonConfig | nindent 2}} + storage: + {{- include "loki.commonStorageConfig" . | nindent 4}} + {{- end}} + + {{- with .Values.loki.limits_config }} + limits_config: + {{- tpl (. 
| toYaml) $ | nindent 4 }} + {{- end }} + + runtime_config: + file: /etc/loki/runtime-config/runtime-config.yaml + + {{- if .Values.chunksCache.enabled }} + {{- with .Values.chunksCache }} + chunk_store_config: + chunk_cache_config: + default_validity: {{ .defaultValidity }} + background: + writeback_goroutines: {{ .writebackParallelism }} + writeback_buffer: {{ .writebackBuffer }} + writeback_size_limit: {{ .writebackSizeLimit }} + memcached: + batch_size: {{ .batchSize }} + parallelism: {{ .parallelism }} + memcached_client: + addresses: {{ .addresses }} + consistent_hash: true + timeout: {{ .timeout }} + max_idle_conns: 72 + {{- end }} + {{- with .Values.chunksCache.l2 }} + {{- if .enabled }} + l2_chunk_cache_handoff: {{ .l2ChunkCacheHandoff }} + chunk_cache_config_l2: + default_validity: {{ .defaultValidity }} + background: + writeback_goroutines: {{ .writebackParallelism }} + writeback_buffer: {{ .writebackBuffer }} + writeback_size_limit: {{ .writebackSizeLimit }} + memcached: + batch_size: {{ .batchSize }} + parallelism: {{ .parallelism }} + memcached_client: + addresses: {{ .addresses }} + consistent_hash: true + timeout: {{ .timeout }} + max_idle_conns: 72 + {{- end }} + {{- end }} + {{- end }} + + {{- if .Values.loki.schemaConfig }} + schema_config: + {{- toYaml .Values.loki.schemaConfig | nindent 2}} + {{- end }} + + {{- if .Values.loki.useTestSchema }} + schema_config: + {{- toYaml .Values.loki.testSchemaConfig | nindent 2}} + {{- end }} + + {{- if .Values.ruler.enabled }} + {{ include "loki.rulerConfig" . }} + {{- end }} + + {{- if and .Values.loki.storage.use_thanos_objstore .Values.ruler.enabled}} + ruler_storage: + {{- include "loki.rulerThanosStorageConfig" . | nindent 2 }} + {{- end }} + + {{- if or .Values.tableManager.retention_deletes_enabled .Values.tableManager.retention_period }} + table_manager: + retention_deletes_enabled: {{ .Values.tableManager.retention_deletes_enabled }} + retention_period: {{ .Values.tableManager.retention_period }} + {{- end }} + + query_range: + align_queries_with_step: true + {{- with .Values.loki.query_range }} + {{- tpl (. | toYaml) $ | nindent 2 }} + {{- end }} + {{- if .Values.resultsCache.enabled }} + {{- with .Values.resultsCache }} + cache_results: true + results_cache: + cache: + default_validity: {{ .defaultValidity }} + background: + writeback_goroutines: {{ .writebackParallelism }} + writeback_buffer: {{ .writebackBuffer }} + writeback_size_limit: {{ .writebackSizeLimit }} + memcached_client: + addresses: {{ .addresses }} + consistent_hash: true + timeout: {{ .timeout }} + update_interval: 1m + {{- end }} + {{- end }} + + {{- with .Values.loki.storage_config }} + storage_config: + {{- if not (hasKey $.Values.loki.storage_config "use_thanos_objstore") }} + use_thanos_objstore: {{ $.Values.loki.storage.use_thanos_objstore }} + {{- end }} + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.query_scheduler }} + query_scheduler: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.compactor }} + compactor: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.compactor_grpc_client }} + compactor_grpc_client: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.analytics }} + analytics: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- if .Values.loki.ui.enabled }} + ui: + enabled: true + {{- end }} + {{- with .Values.loki.querier }} + querier: + {{- tpl (. 
| toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.index_gateway }} + index_gateway: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.frontend }} + frontend: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.frontend_worker }} + frontend_worker: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.distributor }} + distributor: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + tracing: + enabled: {{ .Values.loki.tracing.enabled }} + + {{- with .Values.loki.bloom_build }} + bloom_build: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.bloom_gateway }} + bloom_gateway: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.operational_config }} + operational_config: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + # Should authentication be enabled + auth_enabled: true + # -- memberlist configuration (overrides embedded default) + memberlistConfig: {} + # -- Extra memberlist configuration + extraMemberlistConfig: {} + # -- Tenants list to be created on nginx htpasswd file, with name and password or passwordHash keys
+  # Example:
+  # tenants:
+  #   - name: "test-user-1"
+  #     password: "test-password-1"
+  #   - name: "test-user-2"
+  #     passwordHash: "$2y$10$7O40CaY1yz7fu9O24k2/u.ct/wELYHRBsn25v/7AyuQ8E8hrLqpva" # generated using `htpasswd -nbBC10 test-user-2 test-password-2`
+  #
+ tenants: [] + # -- Check https://grafana.com/docs/loki/latest/configuration/#server for more info on the server configuration. + server: + http_listen_port: 3100 + grpc_listen_port: 9095 + http_server_read_timeout: 600s + http_server_write_timeout: 600s + # -- Limits config + limits_config: + # global retention (override per-tenant if needed) + retention_period: 30d + ingestion_rate_mb: 8 + ingestion_burst_size_mb: 16 + unordered_writes: true + reject_old_samples: true + # 7d + reject_old_samples_max_age: 168h + query_timeout: 2m + max_cache_freshness_per_query: 10m + max_query_parallelism: 32 + per_stream_rate_limit: 3MB + per_stream_rate_limit_burst: 6MB + # -- Provides a reloadable runtime configuration file for some specific configuration + runtimeConfig: {} + # -- Check https://grafana.com/docs/loki/latest/configuration/#common_config for more info on how to provide a common configuration + commonConfig: + path_prefix: /var/loki + replication_factor: 3 + # -- The gRPC address of the compactor. The use of compactor_grpc_address is prefered over compactor_address. + # If a customized compactor_address is set, compactor_grpc_address should be set to an empty string. + compactor_grpc_address: '{{ include "loki.compactorAddress" . }}' + # -- Storage config. Providing this will automatically populate all necessary storage configs in the templated config. + # -- In case of using thanos storage, enable use_thanos_objstore and the configuration should be done inside the object_store section. + storage: + # Loki requires a bucket for chunks and the ruler. GEL requires a third bucket for the admin API. + # Please provide these values if you are using object storage. + # bucketNames: + # chunks: FIXME + # ruler: FIXME + # admin: FIXME + type: s3 + s3: + s3: null + endpoint: null + region: null + secretAccessKey: null + accessKeyId: null + signatureVersion: null + s3ForcePathStyle: false + insecure: false + http_config: {} + # -- Check https://grafana.com/docs/loki/latest/configure/#s3_storage_config for more info on how to provide a backoff_config + backoff_config: {} + disable_dualstack: false + gcs: + chunkBufferSize: 0 + requestTimeout: "0s" + enableHttp2: true + azure: + accountName: null + accountKey: null + connectionString: null + useManagedIdentity: false + useFederatedToken: false + userAssignedId: null + requestTimeout: null + endpointSuffix: null + chunkDelimiter: null + swift: + auth_version: null + auth_url: null + internal: null + username: null + user_domain_name: null + user_domain_id: null + user_id: null + password: null + domain_id: null + domain_name: null + project_id: null + project_name: null + project_domain_id: null + project_domain_name: null + region_name: null + container_name: null + max_retries: null + connect_timeout: null + request_timeout: null + filesystem: + chunks_directory: /var/loki/chunks + rules_directory: /var/loki/rules + + # Loki now supports using thanos storage clients for connecting to object storage backend. + # This will become the default way to configure storage in a future releases. + use_thanos_objstore: false + + object_store: + # Type of object store. Valid options are: s3, gcs, azure + type: s3 + # Optional prefix for storage keys + storage_prefix: null + # S3 configuration (when type is "s3") + s3: + # S3 endpoint URL + endpoint: null + # Optional region + region: null + # Optional access key + access_key_id: null + # Optional secret key + secret_access_key: null + # Optional. 
Enable if using self-signed TLS + insecure: false + # Optional server-side encryption configuration + sse: {} + # Optional HTTP client configuration + http: {} + + # GCS configuration (when type is "gcs") + gcs: + # Name of the bucket + bucket_name: null + # Optional service account JSON + service_account: null + + # Azure configuration (when type is "azure") + azure: + # Storage account name + account_name: null + # Optional storage account key + account_key: null + + # -- Check https://grafana.com/docs/loki/latest/configuration/#schema_config for more info on how to configure schemas + schemaConfig: + configs: + - from: "2025-01-01" + store: tsdb + object_store: swift + schema: v13 + index: + prefix: index_ + period: 24h + # -- a real Loki install requires a proper schemaConfig defined above this, however for testing or playing around + # you can enable useTestSchema + useTestSchema: false + testSchemaConfig: + configs: + - from: 2024-04-01 + store: tsdb + object_store: '{{ include "loki.testSchemaObjectStore" . }}' + schema: v13 + index: + prefix: index_ + period: 24h + ## A separate loki ruler storage configuration can be provided via rulerStorage.storage section: + ## rulerConfig: + ## storage: + ## type: local + # -- Check https://grafana.com/docs/loki/latest/configuration/#ruler for more info on configuring ruler + rulerConfig: + wal: + dir: /var/loki/ruler-wal + # -- Storage for the ruler. If defining rules in `ruler.directories`, this must be configured to use local storage as shown below. + # storage: + # type: local + # local: + # directory: /etc/loki/rules + # -- Structured loki configuration, takes precedence over `loki.config`, `loki.schemaConfig`, `loki.storageConfig` + structuredConfig: {} + # -- Additional query scheduler config + query_scheduler: {} + # -- Additional storage config + storage_config: + boltdb_shipper: + index_gateway_client: + server_address: '{{ include "loki.indexGatewayAddress" . }}' + tsdb_shipper: + index_gateway_client: + server_address: '{{ include "loki.indexGatewayAddress" . }}' + bloom_shipper: + working_directory: /var/loki/data/bloomshipper + hedging: + at: "250ms" + max_per_second: 20 + up_to: 3 + # -- Optional compactor configuration + compactor: {} + # -- Optional compactor grpc client configuration + compactor_grpc_client: {} + # -- Optional pattern ingester configuration + pattern_ingester: + enabled: false + # -- Optional analytics configuration + analytics: {} + # -- Optional Loki UI: Provides access to a operators UI for Loki distributed. When enabled UI will be available at /ui/ of loki-gateway + ui: + # Disabled by default for backwards compatibility. Enable to use the Loki UI. + enabled: false + gateway: + # enable gateway proxying to UI under /ui + enabled: true + # -- Optional querier configuration + query_range: {} + # -- Optional querier configuration + querier: {} + # -- Optional ingester configuration + ingester: {} + # -- Optional ingester client configuration + ingester_client: {} + # -- Optional block builder configuration + block_builder: {} + # -- Optional index gateway configuration + index_gateway: + mode: simple + frontend: + scheduler_address: '{{ include "loki.querySchedulerAddress" . }}' + tail_proxy_url: '{{ include "loki.querierAddress" . }}' + frontend_worker: + scheduler_address: '{{ include "loki.querySchedulerAddress" . 
}}' + # -- Optional distributor configuration + distributor: {} + # -- Enable tracing + tracing: + enabled: false + bloom_build: + enabled: false + builder: + planner_address: '{{ include "loki.bloomPlannerAddress" . }}' + bloom_gateway: + enabled: false + client: + addresses: '{{ include "loki.bloomGatewayAddresses" . }}' + # -- Optional operational configuration + operational_config: {} +###################################################################################################################### +# +# Enterprise Loki Configs +# +###################################################################################################################### + +# -- Configuration for running Enterprise Loki +enterprise: + # Enable enterprise features, license must be provided + enabled: false + # Default version of GEL to deploy + version: 3.5.4 + # -- Optional name of the GEL cluster, otherwise will use .Release.Name + # The cluster name must match what is in your GEL license + cluster_name: null + # -- Grafana Enterprise Logs license + # In order to use Grafana Enterprise Logs features, you will need to provide + # the contents of your Grafana Enterprise Logs license, either by providing the + # contents of the license.jwt, or the name Kubernetes Secret that contains your + # license.jwt. + # To set the license contents, use the flag `--set-file 'enterprise.license.contents=./license.jwt'` + license: + contents: "NOTAVALIDLICENSE" + # -- Set to true when providing an external license + useExternalLicense: false + # -- Name of external license secret to use + externalLicenseName: null + # -- Name of the external config secret to use + externalConfigName: "" + # -- Use GEL gateway, if false will use the default nginx gateway + gelGateway: true + # -- If enabled, the correct admin_client storage will be configured. If disabled while running enterprise, + # make sure auth is set to `type: trust`, or that `auth_enabled` is set to `false`. + adminApi: + enabled: true + # enterprise specific sections of the config.yaml file + config: | + {{- if .Values.enterprise.adminApi.enabled }} + admin_client: + {{ include "enterprise-logs.adminAPIStorageConfig" . | nindent 2 }} + {{ end }} + auth: + type: {{ .Values.enterprise.adminApi.enabled | ternary "enterprise" "trust" }} + auth_enabled: {{ .Values.loki.auth_enabled }} + cluster_name: {{ include "loki.clusterName" . }} + license: + path: /etc/loki/license/license.jwt + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/enterprise-logs + # -- Docker image tag + tag: 3.5.4 + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + adminToken: + # -- Name of external secret containing the admin token for enterprise provisioner + # This secret must exist before deploying and must contain a key named 'token' + secret: null + # -- Alternative name of the secret to store token for the canary + canarySecret: null + # -- Configuration for `provisioner` target + # Note: Uses enterprise.adminToken.secret value to mount the admin token used to call the admin api. + provisioner: + # -- Whether the job should be part of the deployment + enabled: true + # -- Name of the secret to store provisioned tokens in + provisionedSecretPrefix: null + # -- Hook type(s) to customize when the job runs. defaults to post-install + hookType: "post-install" + # -- url of the admin api to use for the provisioner + apiUrl: '{{ include "loki.address" . 
}}' + # -- Additional tenants to be created. Each tenant will get a read and write policy + # and associated token. Tenant must have a name and a namespace for the secret containting + # the token to be created in. For example + # additionalTenants: + # - name: loki + # secretNamespace: grafana + additionalTenants: [] + # -- Additional Kubernetes environment + env: [] + # -- Additional labels for the `provisioner` Job + labels: {} + # -- Additional annotations for the `provisioner` Job + annotations: {} + # -- Affinity for provisioner Pods + # The value will be passed through tpl. + affinity: {} + # -- Node selector for provisioner Pods + nodeSelector: {} + # -- Tolerations for provisioner Pods + tolerations: [] + # -- The name of the PriorityClass for provisioner Job + priorityClassName: null + # -- Use the host's user namespace in provisioner pods + hostUsers: nil + # -- Run containers as user `enterprise-logs(uid=10001)` + securityContext: + runAsNonRoot: true + runAsGroup: 10001 + runAsUser: 10001 + fsGroup: 10001 + # -- Provisioner image to Utilize + image: + # -- The Docker registry + registry: us-docker.pkg.dev + # -- Docker image repository + repository: grafanalabs-global/docker-enterprise-provisioner-prod/enterprise-provisioner + # -- Overrides the image tag whose default is the chart's appVersion + tag: latest + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Volume mounts to add to the provisioner pods + extraVolumeMounts: [] + # -- Additional volumes for Pods + extraVolumes: [] +###################################################################################################################### +# +# Chart Testing +# +###################################################################################################################### + +# -- Section for configuring optional Helm test +test: + enabled: true + # -- Used to directly query the metrics endpoint of the canary for testing, this approach avoids needing prometheus for testing. + # This in a newer approach to using prometheusAddress such that tests do not have a dependency on prometheus + canaryServiceAddress: "http://loki-canary:3500/metrics" + # -- Address of the prometheus server to query for the test. This overrides any value set for canaryServiceAddress. + # This is kept for backward compatibility and may be removed in future releases. Previous value was 'http://prometheus:9090' + prometheusAddress: "" + # -- Number of times to retry the test before failing + timeout: 1m + # -- Additional labels for the test pods + labels: {} + # -- Additional annotations for test pods + annotations: {} + # -- Image to use for loki canary + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki-helm-test + # -- Overrides the image tag whose default is the chart's appVersion + tag: "latest" + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Use the host's user namespace in test pods + hostUsers: nil +# The Loki canary pushes logs to and queries from this loki installation to test +# that it's working correctly +lokiCanary: + enabled: true + # -- The type of the loki canary k8s rollout. This can be a DaemonSet or Deployment. 
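+  # For example (an illustrative override, not the default used below), to run the canary as a
+  # Deployment with a fixed replica count instead of one pod per node:
+  #   kind: Deployment
+  #   replicas: 2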
+ kind: DaemonSet + # -- If true, the canary will send directly to Loki via the address configured for verification -- + # -- If false, it will write to stdout and an Agent will be needed to scrape and send the logs -- + push: true + # -- If set overwrites the default value set by loki.host helper function. Use this if gateway not enabled. + lokiurl: null + # -- The name of the label to look for at loki when doing the checks. + labelname: pod + # -- Additional annotations for the `loki-canary` Daemonset + annotations: {} + # -- Additional labels for each `loki-canary` pod + podLabels: {} + service: + # -- Annotations for loki-canary Service + annotations: {} + # -- Additional labels for loki-canary Service + labels: {} + # -- Additional CLI arguments for the `loki-canary' command + extraArgs: [] + # -- Environment variables to add to the canary pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the canary pods + extraEnvFrom: [] + # -- Volume mounts to add to the canary pods + extraVolumeMounts: [] + # -- Volumes to add to the canary pods + extraVolumes: [] + # -- Resource requests and limits for the canary + resources: {} + # -- DNS config for canary pods + dnsConfig: {} + # -- Node selector for canary pods + nodeSelector: {} + # -- Tolerations for canary pods + tolerations: [] + # -- Affinity for canary pods + affinity: {} + # -- The name of the PriorityClass for loki-canary pods + priorityClassName: null + # -- Use the host's user namespace in loki-canary pods + hostUsers: nil + # -- Image to use for loki canary + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki-canary + # -- Overrides the image tag whose default is the chart's appVersion + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Update strategy for the `loki-canary` Daemonset pods + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + # -- Replicas for `loki-canary` when using a Deployment + replicas: 1 +###################################################################################################################### +# +# Service Accounts and Kubernetes RBAC +# +###################################################################################################################### +serviceAccount: + # -- Specifies whether a ServiceAccount should be created + create: true + # -- The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: null + # -- Image pull secrets for the service account + imagePullSecrets: [] + # -- Annotations for the service account + annotations: {} + # -- Labels for the service account + labels: {} + # -- Set this toggle to false to opt out of automounting API credentials for the service account + automountServiceAccountToken: true +# RBAC configuration +rbac: + # -- If pspEnabled true, a PodSecurityPolicy is created for K8s that use psp. + pspEnabled: false + # -- For OpenShift set pspEnabled to 'false' and sccEnabled to 'true' to use the SecurityContextConstraints. 
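+  # For example (illustrative, OpenShift clusters only), the combination described above:
+  #   pspEnabled: false
+  #   sccEnabled: true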
+ sccEnabled: false + # -- Toggle this to true to allow the use of hostPath volumes on OpenShift + sccAllowHostDirVolumePlugin: false + # -- Specify PSP annotations + # Ref: https://kubernetes.io/docs/reference/access-authn-authz/psp-to-pod-security-standards/#podsecuritypolicy-annotations + pspAnnotations: {} + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + # -- Whether to install RBAC in the namespace only or cluster-wide. Useful if you want to watch ConfigMap globally. + namespaced: false +###################################################################################################################### +# +# Network Policy configuration +# +###################################################################################################################### +networkPolicy: + # -- Specifies whether Network Policies should be created + enabled: false + # -- Specifies whether the policies created will be standard Network Policies (flavor: kubernetes) + # or Cilium Network Policies (flavor: cilium) + flavor: kubernetes + metrics: + # -- Specifies the Pods which are allowed to access the metrics port. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespaces which are allowed to access the metrics port + namespaceSelector: {} + # -- Specifies specific network CIDRs which are allowed to access the metrics port. + # In case you use namespaceSelector, you also have to specify your kubelet networks here. + # The metrics ports are also used for probes. + cidrs: [] + ingress: + # -- Specifies the Pods which are allowed to access the http port. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespaces which are allowed to access the http port + namespaceSelector: {} + alertmanager: + # -- Specify the alertmanager port used for alerting + port: 9093 + # -- Specifies the alertmanager Pods. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespace the alertmanager is running in + namespaceSelector: {} + externalStorage: + # -- Specify the port used for external storage, e.g. AWS S3 + ports: [] + # -- Specifies specific network CIDRs you want to limit access to + cidrs: [] + discovery: + # -- (int) Specify the port used for discovery + port: null + # -- Specifies the Pods labels used for discovery. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespace the discovery Pods are running in + namespaceSelector: {} + egressWorld: + # -- Enable additional cilium egress rules to external world for write, read and backend. + enabled: false + egressKubeApiserver: + # -- Enable additional cilium egress rules to kube-apiserver for backend. 
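+    # For example (illustrative, effective only together with `flavor: cilium` above), letting
+    # the backend reach the kube-apiserver would set this to true:
+    #   enabled: true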
+ enabled: false +###################################################################################################################### +# +# Global memberlist configuration +# +###################################################################################################################### + +# Configuration for the memberlist service +memberlist: + service: + publishNotReadyAddresses: false + annotations: {} +###################################################################################################################### +# +# adminAPI configuration, enterprise only. +# +###################################################################################################################### + +# -- Configuration for the `admin-api` target +adminApi: + # -- Define the amount of instances + replicas: 1 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Additional CLI arguments for the `admin-api` target + extraArgs: {} + # -- Environment variables to add to the admin-api pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the admin-api pods + extraEnvFrom: [] + # -- Additional labels for the `admin-api` Deployment + labels: {} + # -- Additional annotations for the `admin-api` Deployment + annotations: {} + # -- DNSConfig for `admin-api` pods + dnsConfig: {} + # -- Additional labels and annotations for the `admin-api` Service + service: + labels: {} + annotations: {} + # -- Run container as user `enterprise-logs(uid=10001)` + # `fsGroup` must not be specified, because these security options are applied + # on container level not on Pod level. + podSecurityContext: + runAsNonRoot: true + runAsGroup: 10001 + runAsUser: 10001 + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Update strategy + strategy: + type: RollingUpdate + # -- Liveness probe + livenessProbe: {} + # -- Readiness probe + readinessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 45 + # -- Startup probe + startupProbe: {} + # -- Request and limit Kubernetes resources + # -- Values are defined in small.yaml and large.yaml + resources: {} + # -- Configure optional environment variables + env: [] + # -- Configure optional initContainers + initContainers: [] + # -- Configure optional extraContainers + extraContainers: [] + # -- Additional volumes for Pods + extraVolumes: [] + # -- Additional volume mounts for Pods + extraVolumeMounts: [] + # -- Affinity for admin-api Pods + # The value will be passed through tpl. + affinity: {} + # -- Node selector for admin-api Pods + nodeSelector: {} + # -- Topology Spread Constraints for admin-api pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for admin-api Pods + tolerations: [] + # -- Grace period to allow the admin-api to shutdown before it is killed + terminationGracePeriodSeconds: 60 + # -- Use the host's user namespace in admin-api pods + hostUsers: nil +###################################################################################################################### +# +# Gateway and Ingress +# +# By default this chart will deploy a Nginx container to act as a gateway which handles routing of traffic +# and can also do auth. +# +# If you would prefer you can optionally disable this and enable using k8s ingress to do the incoming routing. 
+# +###################################################################################################################### + +# Configuration for the gateway +gateway: + # -- Specifies whether the gateway should be enabled + enabled: true + # -- Number of replicas for the gateway + replicas: 1 + # -- Default container port + containerPort: 8080 + # -- Enable logging of 2xx and 3xx HTTP requests + verboseLogging: true + autoscaling: + # -- Enable autoscaling for the gateway + enabled: false + # -- Minimum autoscaling replicas for the gateway + minReplicas: 1 + # -- Maximum autoscaling replicas for the gateway + maxReplicas: 3 + # -- Target CPU utilisation percentage for the gateway + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the gateway + targetMemoryUtilizationPercentage: + # -- See `kubectl explain deployment.spec.strategy` for more + # -- ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + # -- Behavior policies while scaling. + behavior: {} + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 60 + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + deploymentStrategy: + type: RollingUpdate + image: + # -- The Docker registry for the gateway image + registry: docker.io + # -- The gateway image repository + repository: nginxinc/nginx-unprivileged + # -- The gateway image tag + tag: 1.29-alpine + # -- Overrides the gateway image tag with an image digest + digest: null + # -- The gateway image pull policy + pullPolicy: IfNotPresent + # -- The name of the PriorityClass for gateway pods + priorityClassName: null + # -- Annotations for gateway deployment + annotations: {} + # -- Annotations for gateway pods + podAnnotations: {} + # -- Additional labels for gateway pods + podLabels: {} + # -- Additional CLI args for the gateway + extraArgs: [] + # -- Environment variables to add to the gateway pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the gateway pods + extraEnvFrom: [] + # -- Lifecycle for the gateway container + lifecycle: {} + # -- Volumes to add to the gateway pods + extraVolumes: [] + # -- Volume mounts to add to the gateway pods + extraVolumeMounts: [] + # -- The SecurityContext for gateway containers + podSecurityContext: + fsGroup: 101 + runAsGroup: 101 + runAsNonRoot: true + runAsUser: 101 + # -- The SecurityContext for gateway containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Use the host's user namespace in the gateway + hostUsers: nil + # -- Resource requests and limits for the gateway + resources: {} + # -- Containers to add to the gateway pods + extraContainers: [] + # -- Grace period to allow the gateway to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for gateway pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: gateway + app.kubernetes.io/name: '{{ include "loki.name" . 
}}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- DNS config for gateway pods + dnsConfig: {} + # -- Node selector for gateway pods + nodeSelector: {} + # -- Topology Spread Constraints for gateway pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for gateway pods + tolerations: [] + # Gateway service configuration + service: + # -- Port of the gateway service + port: 80 + # -- Type of the gateway service + type: ClusterIP + # -- ClusterIP of the gateway service + clusterIP: null + # -- (int) Node port if service type is NodePort + nodePort: null + # -- Load balancer IPO address if service type is LoadBalancer + loadBalancerIP: null + # -- Annotations for the gateway service + annotations: {} + # -- Labels for gateway service + labels: {} + # Gateway ingress configuration + ingress: + # -- Specifies whether an ingress for the gateway should be created + enabled: false + # -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18 + ingressClassName: "" + # -- Annotations for the gateway ingress + annotations: {} + # -- Labels for the gateway ingress + labels: {} + # -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating + hosts: + - host: gateway.loki.example.com + paths: + - path: / + # -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers + # pathType: Prefix + # -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating + tls: + - secretName: loki-gateway-tls + hosts: + - gateway.loki.example.com + # Basic auth configuration + basicAuth: + # -- Enables basic authentication for the gateway + enabled: false + # -- The basic auth username for the gateway + username: null + # -- The basic auth password for the gateway + password: null + # -- Uses the specified users from the `loki.tenants` list to create the htpasswd file. + # if `loki.tenants` is not set, the `gateway.basicAuth.username` and `gateway.basicAuth.password` are used. + # The value is templated using `tpl`. Override this to use a custom htpasswd, e.g. in case the default causes + # high CPU load. + # @default -- Either `loki.tenants` or `gateway.basicAuth.username` and `gateway.basicAuth.password`. + htpasswd: | + {{- with $tenants := .Values.loki.tenants }} + {{- range $t := $tenants }} + {{- $username := required "All tenants must have a 'name' set" $t.name }} + {{- if $passwordHash := $t.passwordHash }} + {{- printf "%s:%s\n" $username $passwordHash }} + {{- else if $password := $t.password }} + {{- printf "%s\n" (htpasswd $username $password) }} + {{- else }} + {{- fail "All tenants must have a 'password' or 'passwordHash' set" }} + {{- end }} + {{- end }} + {{- else }} + {{- printf "%s\n" (htpasswd (required "'gateway.basicAuth.username' is required" .Values.gateway.basicAuth.username) (required "'gateway.basicAuth.password' is required" .Values.gateway.basicAuth.password)) }} + {{- end }} + # -- Existing basic auth secret to use. Must contain '.htpasswd' + existingSecret: null + # -- liveness probe for the nginx container in the gateway pods. + livenessProbe: {} + # Configures the readiness probe for the gateway + readinessProbe: + httpGet: + path: / + port: http-metrics + initialDelaySeconds: 15 + timeoutSeconds: 1 + # -- startup probe for the nginx container in the gateway pods. + startupProbe: {} + nginxConfig: + # -- Which schema to be used when building URLs. 
Can be 'http' or 'https'. + schema: http + # -- Enable listener for IPv6, disable on IPv4-only systems + enableIPv6: true + # -- NGINX log format + logFormat: |- + main '$remote_addr - $remote_user [$time_local] $status ' + '"$request" $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + # -- Allows appending custom configuration to the server block + serverSnippet: "" + # -- Allows appending custom configuration to the http block, passed through the `tpl` function to allow templating + httpSnippet: "" + # -- Allows appending custom configuration inside every location block, useful for authentication or setting headers that are not inherited from the server block, passed through the `tpl` function to allow templating. + locationSnippet: >- + {{ if .Values.loki.tenants }}proxy_set_header X-Scope-OrgID $remote_user;{{ end }} + # -- Allows customizing the `client_max_body_size` directive + clientMaxBodySize: 4M + # -- Whether ssl should be appended to the listen directive of the server block or not. + ssl: false + # -- Override Read URL + customReadUrl: null + # -- Override Write URL + customWriteUrl: null + # -- Override Backend URL + customBackendUrl: null + # -- Allows overriding the DNS resolver address nginx will use. + resolver: "" + # -- Config file contents for Nginx. Passed through the `tpl` function to allow templating + # @default -- See values.yaml + file: | + {{- include "loki.nginxFile" . -}} +# -- If running enterprise and using the default enterprise gateway, configs go here. +enterpriseGateway: + # -- Define the amount of instances + replicas: 1 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the `gateway` pod + hostUsers: nil + # -- Additional CLI arguments for the `gateway` target + extraArgs: {} + # -- Environment variables from secrets or configmaps to add to the enterprise gateway pods + extraEnvFrom: [] + # -- Additional labels for the `gateway` Pod + labels: {} + # -- Additional annotations for the `gateway` Pod + annotations: {} + # -- Additional labels and annotations for the `gateway` Service + # -- Service overriding service type + service: + type: ClusterIP + labels: {} + annotations: {} + # -- Run container as user `enterprise-logs(uid=10001)` + podSecurityContext: + runAsNonRoot: true + runAsGroup: 10001 + runAsUser: 10001 + fsGroup: 10001 + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- If you want to use your own proxy URLs, set this to false. + useDefaultProxyURLs: true + # -- update strategy + strategy: + type: RollingUpdate + # -- Readiness probe + readinessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 45 + # -- Request and limit Kubernetes resources + # -- Values are defined in small.yaml and large.yaml + resources: {} + # -- Configure optional environment variables + env: [] + # -- Configure optional initContainers + initContainers: [] + # -- Conifgure optional extraContainers + extraContainers: [] + # -- Additional volumes for Pods + extraVolumes: [] + # -- Additional volume mounts for Pods + extraVolumeMounts: [] + # -- Affinity for gateway Pods + # The value will be passed through tpl. + affinity: {} + # -- Node selector for gateway Pods + nodeSelector: {} + # -- Topology Spread Constraints for enterprise-gateway pods + # The value will be passed through tpl. 
+ topologySpreadConstraints: [] + # -- Tolerations for gateway Pods + tolerations: [] + # -- Grace period to allow the gateway to shutdown before it is killed + terminationGracePeriodSeconds: 60 +# -- Ingress configuration Use either this ingress or the gateway, but not both at once. +# If you enable this, make sure to disable the gateway. +# You'll need to supply authn configuration for your ingress controller. +ingress: + enabled: false + ingressClassName: "" + annotations: {} + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: loki-distributed-basic-auth + # nginx.ingress.kubernetes.io/auth-secret-type: auth-map + # nginx.ingress.kubernetes.io/configuration-snippet: | + # proxy_set_header X-Scope-OrgID $remote_user; + labels: {} + # blackbox.monitoring.exclude: "true" + paths: + # -- Paths that are exposed by Loki Distributor. + # If deployment mode is Distributed, the requests are forwarded to the service: `{{"loki.distributorFullname"}}`. + # If deployment mode is SimpleScalable, the requests are forwarded to write k8s service: `{{"loki.writeFullname"}}`. + # If deployment mode is SingleBinary, the requests are forwarded to the central/single k8s service: `{{"loki.singleBinaryFullname"}}` + distributor: + - /api/prom/push + - /loki/api/v1/push + - /otlp/v1/logs + - /ui + # -- Paths that are exposed by Loki Query Frontend. + # If deployment mode is Distributed, the requests are forwarded to the service: `{{"loki.queryFrontendFullname"}}`. + # If deployment mode is SimpleScalable, the requests are forwarded to write k8s service: `{{"loki.readFullname"}}`. + # If deployment mode is SingleBinary, the requests are forwarded to the central/single k8s service: `{{"loki.singleBinaryFullname"}}` + queryFrontend: + - /api/prom/query + # this path covers labels and labelValues endpoints + - /api/prom/label + - /api/prom/series + - /api/prom/tail + - /loki/api/v1/query + - /loki/api/v1/query_range + - /loki/api/v1/tail + # this path covers labels and labelValues endpoints + - /loki/api/v1/label + - /loki/api/v1/labels + - /loki/api/v1/series + - /loki/api/v1/index/stats + - /loki/api/v1/index/volume + - /loki/api/v1/index/volume_range + - /loki/api/v1/format_query + - /loki/api/v1/detected_field + - /loki/api/v1/detected_fields + - /loki/api/v1/detected_labels + - /loki/api/v1/patterns + # -- Paths that are exposed by Loki Ruler. + # If deployment mode is Distributed, the requests are forwarded to the service: `{{"loki.rulerFullname"}}`. + # If deployment mode is SimpleScalable, the requests are forwarded to k8s service: `{{"loki.backendFullname"}}`. + # If deployment mode is SimpleScalable but `read.legacyReadTarget` is `true`, the requests are forwarded to k8s service: `{{"loki.readFullname"}}`. + # If deployment mode is SingleBinary, the requests are forwarded to the central/single k8s service: `{{"loki.singleBinaryFullname"}}` + ruler: + - /api/prom/rules + - /api/prom/api/v1/rules + - /api/prom/api/v1/alerts + - /loki/api/v1/rules + - /prometheus/api/v1/rules + - /prometheus/api/v1/alerts + # -- Paths that are exposed by Loki Compactor. + # If deployment mode is Distributed, the requests are forwarded to the service: `{{"loki.compactorFullname"}}`. + # If deployment mode is SimpleScalable, the requests are forwarded to k8s service: `{{"loki.backendFullname"}}`. 
+ # If deployment mode is SingleBinary, the requests are forwarded to the central/single k8s service: `{{"loki.singleBinaryFullname"}}` + compactor: + - /loki/api/v1/delete + # -- Hosts configuration for the ingress, passed through the `tpl` function to allow templating + hosts: + - loki.example.com + # -- TLS configuration for the ingress. Hosts passed through the `tpl` function to allow templating + tls: [] +# - hosts: +# - loki.example.com +# secretName: loki-distributed-tls + +###################################################################################################################### +# +# Migration +# +###################################################################################################################### + +# -- Options that may be necessary when performing a migration from another helm chart +migrate: + # -- When migrating from a distributed chart like loki-distributed or enterprise-logs + fromDistributed: + # -- Set to true if migrating from a distributed helm chart + enabled: false + # -- If migrating from a distributed service, provide the distributed deployment's + # memberlist service DNS so the new deployment can join its ring. + memberlistService: "" +###################################################################################################################### +# +# Single Binary Deployment +# +# For small Loki installations up to a few 10's of GB per day, or for testing and development. +# +###################################################################################################################### + +# Configuration for the single binary node(s) +singleBinary: + # -- Number of replicas for the single binary + replicas: 0 + autoscaling: + # -- Enable autoscaling + enabled: false + # -- Minimum autoscaling replicas for the single binary + minReplicas: 1 + # -- Maximum autoscaling replicas for the single binary + maxReplicas: 3 + # -- Target CPU utilisation percentage for the single binary + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the single binary + targetMemoryUtilizationPercentage: + image: + # -- The Docker registry for the single binary image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the single binary image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the single binary image. 
Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for single binary pods + priorityClassName: null + # -- Annotations for single binary StatefulSet + annotations: {} + # -- Annotations for single binary pods + podAnnotations: {} + # -- Additional labels for each `single binary` pod + podLabels: {} + # -- Additional selector labels for each `single binary` pod + selectorLabels: {} + service: + # -- Annotations for single binary Service + annotations: {} + # -- Additional labels for single binary Service + labels: {} + # -- Service Type for single binary Service + type: "ClusterIP" + # -- Comma-separated list of Loki modules to load for the single binary + targetModule: "all" + # -- Labels for single binary service + extraArgs: [] + # -- Environment variables to add to the single binary pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the single binary pods + extraEnvFrom: [] + # -- Extra containers to add to the single binary loki pod + extraContainers: [] + # -- Init containers to add to the single binary pods + initContainers: [] + # -- Volume mounts to add to the single binary pods + extraVolumeMounts: [] + # -- Volumes to add to the single binary pods + extraVolumes: [] + # -- Resource requests and limits for the single binary + resources: {} + # -- Grace period to allow the single binary to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Use the host's user namespace in the single binary pods + hostUsers: nil + # -- Affinity for single binary pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: single-binary + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- DNS config for single binary pods + dnsConfig: {} + # -- Node selector for single binary pods + nodeSelector: {} + # -- Tolerations for single binary pods + tolerations: [] + persistence: + # -- What to do with the volume when the StatefulSet is scaled down. + whenScaled: Delete + # -- What to do with the volumes when the StatefulSet is deleted. + whenDeleted: Delete + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: true + # -- Enable StatefulSetRecreation for changes to PVC size. + # This means that the StatefulSet will be deleted, recreated (with the same name) and rolled when a change to the + # PVC size is detected. That way the PVC can be resized without manual intervention. + enableStatefulSetRecreationForSizeChange: false + # -- Enable persistent disk + enabled: true + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). 
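+    # For example (hypothetical class name), to request a specific provisioner instead of the
+    # cluster default, or set "-" to disable dynamic provisioning entirely:
+    #   storageClass: "fast-ssd"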
+ storageClass: null + # -- Selector for persistent disk + selector: null + # -- Annotations for volume claim + annotations: {} + # -- Labels for volume claim + labels: {} +###################################################################################################################### +# +# Simple Scalable Deployment (SSD) Mode +# +# For small to medium size Loki deployments up to around 1 TB/day, this is the default mode for this helm chart +# +###################################################################################################################### + +# Configuration for the write pod(s) +write: + # -- Number of replicas for the write + replicas: 3 + autoscaling: + # -- Enable autoscaling for the write. + enabled: false + # -- Minimum autoscaling replicas for the write. + minReplicas: 2 + # -- Maximum autoscaling replicas for the write. + maxReplicas: 6 + # -- Target CPU utilisation percentage for the write. + targetCPUUtilizationPercentage: 60 + # -- Target memory utilization percentage for the write. + targetMemoryUtilizationPercentage: + # -- Behavior policies while scaling. + behavior: + # -- see https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown for scaledown details + scaleUp: + policies: + - type: Pods + value: 1 + periodSeconds: 900 + scaleDown: + policies: + - type: Pods + value: 1 + periodSeconds: 1800 + stabilizationWindowSeconds: 3600 + image: + # -- The Docker registry for the write image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the write image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the write image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for write pods + priorityClassName: null + # -- Annotations for write StatefulSet + annotations: {} + # -- Annotations for write pods + podAnnotations: {} + # -- Additional labels for each `write` pod + podLabels: {} + # -- Additional selector labels for each `write` pod + selectorLabels: {} + service: + # -- Annotations for write Service + annotations: {} + # -- Additional labels for write Service + labels: {} + # -- Service Type for write Service + type: "ClusterIP" + # -- Comma-separated list of Loki modules to load for the write + targetModule: "write" + # -- Additional CLI args for the write + extraArgs: [] + # -- Environment variables to add to the write pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the write pods + extraEnvFrom: [] + # -- Lifecycle for the write container + lifecycle: {} + # -- The default /flush_shutdown preStop hook is recommended as part of the ingester + # scaledown process so it's added to the template by default when autoscaling is enabled, + # but it's disabled to optimize rolling restarts in instances that will never be scaled + # down or when using chunks storage with WAL disabled. + # https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown + # -- Init containers to add to the write pods + initContainers: [] + # -- Containers to add to the write pods + extraContainers: [] + # -- Volume mounts to add to the write pods + extraVolumeMounts: [] + # -- Volumes to add to the write pods + extraVolumes: [] + # -- volumeClaimTemplates to add to StatefulSet + extraVolumeClaimTemplates: [] + # -- Resource requests and limits for the write + resources: {} + # -- Grace period to allow the write to shutdown before it is killed. 
Especially for the ingester,
+ # this must be increased. It must be long enough so writes can be gracefully shut down, flushing/transferring
+ # all data and successfully leaving the member ring on shutdown.
+ terminationGracePeriodSeconds: 300
+ # -- Use the host's user namespace in the write pods.
+ hostUsers: nil
+ # -- Affinity for write pods.
+ # @default -- Hard node anti-affinity
+ # The value will be passed through tpl.
+ affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchLabels:
+ app.kubernetes.io/component: write
+ app.kubernetes.io/name: '{{ include "loki.name" . }}'
+ app.kubernetes.io/instance: "{{ .Release.Name }}"
+ topologyKey: kubernetes.io/hostname
+ # -- DNS config for write pods
+ dnsConfig: {}
+ # -- Node selector for write pods
+ nodeSelector: {}
+ # -- Topology Spread Constraints for write pods
+ # The value will be passed through tpl.
+ topologySpreadConstraints: []
+ # -- Tolerations for write pods
+ tolerations: []
+ # -- The default is to deploy all pods in parallel.
+ podManagementPolicy: "Parallel"
+ persistence:
+ # -- Enable volume claims in pod spec
+ volumeClaimsEnabled: true
+ # -- Set access modes on the PersistentVolumeClaim
+ accessModes:
+ - ReadWriteOnce
+ # -- Parameters used for the `data` volume when volumeClaimsEnabled is false
+ dataVolumeParameters:
+ emptyDir: {}
+ # -- Enable StatefulSetAutoDeletePVC feature
+ enableStatefulSetAutoDeletePVC: false
+ # -- Size of persistent disk
+ size: 10Gi
+ # -- Storage class to be used.
+ # If defined, storageClassName: <storageClass>.
+ # If set to "-", storageClassName: "", which disables dynamic provisioning.
+ # If empty or set to null, no storageClassName spec is
+ # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack).
+ storageClass: null
+ # -- Selector for persistent disk
+ selector: null
+ # -- Annotations for volume claim
+ annotations: {}
+ # -- Labels for volume claim
+ labels: {}
+# -- Configuration for the read pod(s)
+read:
+ # -- Number of replicas for the read
+ replicas: 3
+ autoscaling:
+ # -- Enable autoscaling for the read; this is only used if `queryIndex.enabled: true`
+ enabled: false
+ # -- Minimum autoscaling replicas for the read
+ minReplicas: 2
+ # -- Maximum autoscaling replicas for the read
+ maxReplicas: 6
+ # -- Target CPU utilisation percentage for the read
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilisation percentage for the read
+ targetMemoryUtilizationPercentage:
+ # -- Behavior policies while scaling.
+ behavior: {}
+ # scaleUp:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 60
+ # scaleDown:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 180
+ image:
+ # -- The Docker registry for the read image. Overrides `loki.image.registry`
+ registry: null
+ # -- Docker image repository for the read image. Overrides `loki.image.repository`
+ repository: null
+ # -- Docker image tag for the read image. 
Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for read pods + priorityClassName: null + # -- Annotations for read deployment + annotations: {} + # -- Annotations for read pods + podAnnotations: {} + # -- Additional labels for each `read` pod + podLabels: {} + # -- Additional selector labels for each `read` pod + selectorLabels: {} + service: + # -- Annotations for read Service + annotations: {} + # -- Additional labels for read Service + labels: {} + # -- Service Type for read Service + type: ClusterIP + # -- Comma-separated list of Loki modules to load for the read + targetModule: "read" + # -- Whether or not to use the 2 target type simple scalable mode (read, write) or the + # 3 target type (read, write, backend). Legacy refers to the 2 target type, so true will + # run two targets, false will run 3 targets. + legacyReadTarget: false + # -- Additional CLI args for the read + extraArgs: [] + # -- init containers to add to the read pods + initContainers: [] + # -- Containers to add to the read pods + extraContainers: [] + # -- Environment variables to add to the read pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the read pods + extraEnvFrom: [] + # -- Lifecycle for the read container + lifecycle: {} + # -- Volume mounts to add to the read pods + extraVolumeMounts: [] + # -- Volumes to add to the read pods + extraVolumes: [] + # -- Resource requests and limits for the read + resources: {} + # -- liveness probe settings for read pods. If empty, applies no livenessProbe + livenessProbe: {} + # -- Grace period to allow the read to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Use the host's user namespace in the read pods. + hostUsers: nil + # -- Affinity for read pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: read + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- DNS config for read pods + dnsConfig: {} + # -- Node selector for read pods + nodeSelector: {} + # -- Topology Spread Constraints for read pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for read pods + tolerations: [] + # -- The default is to deploy all pods in parallel. + podManagementPolicy: "Parallel" + # -- read.persistence is used only if legacyReadTarget is set to true + persistence: + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: true + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Selector for persistent disk + selector: null + # -- Annotations for volume claim + annotations: {} + # -- Labels for volume claim + labels: {} +# -- Configuration for the backend pod(s) +backend: + # -- Number of replicas for the backend + replicas: 3 + autoscaling: + # -- Enable autoscaling for the backend. + enabled: false + # -- Minimum autoscaling replicas for the backend. 
+ minReplicas: 3 + # -- Maximum autoscaling replicas for the backend. + maxReplicas: 6 + # -- Target CPU utilization percentage for the backend. + targetCPUUtilizationPercentage: 60 + # -- Target memory utilization percentage for the backend. + targetMemoryUtilizationPercentage: + # -- Behavior policies while scaling. + behavior: {} + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 60 + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + image: + # -- The Docker registry for the backend image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the backend image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the backend image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for backend pods + priorityClassName: null + # -- Annotations for backend StatefulSet + annotations: {} + # -- Annotations for backend pods + podAnnotations: {} + # -- Additional labels for each `backend` pod + podLabels: {} + # -- Additional selector labels for each `backend` pod + selectorLabels: {} + service: + # -- Annotations for backend Service + annotations: {} + # -- Additional labels for backend Service + labels: {} + # -- Service type for backend Service + type: ClusterIP + # -- Comma-separated list of Loki modules to load for the backend + targetModule: "backend" + # -- Additional CLI args for the backend + extraArgs: [] + # -- Environment variables to add to the backend pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the backend pods + extraEnvFrom: [] + # -- Init containers to add to the backend pods + initContainers: [] + # -- Containers to add to the backend pods + extraContainers: [] + # -- Volume mounts to add to the backend pods + extraVolumeMounts: [] + # -- Volumes to add to the backend pods + extraVolumes: [] + # -- Resource requests and limits for the backend + resources: {} + # -- Grace period to allow the backend to shutdown before it is killed. Especially for the ingester, + # this must be increased. It must be long enough so backends can be gracefully shutdown flushing/transferring + # all data and to successfully leave the member ring on shutdown. + terminationGracePeriodSeconds: 300 + # -- Use the host's user namespace in the backend pods. + hostUsers: nil + # -- Affinity for backend pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: backend + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- DNS config for backend pods + dnsConfig: {} + # -- Node selector for backend pods + nodeSelector: {} + # -- Topology Spread Constraints for backend pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for backend pods + tolerations: [] + # -- The default is to deploy all pods in parallel. 
+ podManagementPolicy: "Parallel"
+ persistence:
+ # -- Enable volume claims in pod spec
+ volumeClaimsEnabled: true
+ # -- Set access modes on the PersistentVolumeClaim
+ accessModes:
+ - ReadWriteOnce
+ # -- Parameters used for the `data` volume when volumeClaimsEnabled is false
+ dataVolumeParameters:
+ emptyDir: {}
+ # -- Enable StatefulSetAutoDeletePVC feature
+ enableStatefulSetAutoDeletePVC: true
+ # -- Size of persistent disk
+ size: 10Gi
+ # -- Storage class to be used.
+ # If defined, storageClassName: <storageClass>.
+ # If set to "-", storageClassName: "", which disables dynamic provisioning.
+ # If empty or set to null, no storageClassName spec is
+ # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack).
+ storageClass: null
+ # -- Selector for persistent disk
+ selector: null
+ # -- Annotations for volume claim
+ annotations: {}
+ # -- Labels for volume claim
+ labels: {}
+######################################################################################################################
+#
+# Microservices Mode
+#
+# For large Loki deployments ingesting more than 1 TB/day
+#
+######################################################################################################################
+
+# -- Configuration for the ingester
+ingester:
+ # -- Number of replicas for the ingester. When zoneAwareReplication.enabled is true, the total
+ # number of replicas will match this value, with each zone having 1/3rd of the total replicas.
+ replicas: 0
+ # -- DNSConfig for ingester pods
+ dnsConfig: {}
+ # -- hostAliases to add
+ hostAliases: []
+ # - ip: 1.2.3.4
+ # hostnames:
+ # - domain.tld
+ # -- Use the host's user namespace in the ingester
+ hostUsers: nil
+ autoscaling:
+ # -- Enable autoscaling for the ingester
+ enabled: false
+ # -- Minimum autoscaling replicas for the ingester
+ minReplicas: 1
+ # -- Maximum autoscaling replicas for the ingester
+ maxReplicas: 3
+ # -- Target CPU utilisation percentage for the ingester
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilisation percentage for the ingester
+ targetMemoryUtilizationPercentage: null
+ # -- Allows one to define custom metrics using the HPA/v2 schema (for example, Pods, Object or External metrics)
+ customMetrics: []
+ # - type: Pods
+ # pods:
+ # metric:
+ # name: loki_lines_total
+ # target:
+ # type: AverageValue
+ # averageValue: 10k
+ behavior:
+ # -- Enable autoscaling behaviours
+ enabled: false
+ # -- define scale down policies, must conform to HPAScalingRules
+ scaleDown: {}
+ # -- define scale up policies, must conform to HPAScalingRules
+ scaleUp: {}
+ image:
+ # -- The Docker registry for the ingester image. Overrides `loki.image.registry`
+ registry: null
+ # -- Docker image repository for the ingester image. Overrides `loki.image.repository`
+ repository: null
+ # -- Docker image tag for the ingester image. 
Overrides `loki.image.tag`
+ tag: null
+ # -- Command to execute instead of defined in Docker image
+ command: null
+ labels: {}
+ # -- The name of the PriorityClass for ingester pods
+ priorityClassName: null
+ # -- Labels for ingester pods
+ podLabels: {}
+ # -- Annotations for ingester pods
+ podAnnotations: {}
+ # -- Labels for ingester service
+ serviceLabels: {}
+ # -- Annotations for ingester service
+ serviceAnnotations: {}
+ # -- Service type for ingester service
+ serviceType: "ClusterIP"
+ # -- Additional CLI args for the ingester
+ extraArgs: []
+ # -- Environment variables to add to the ingester pods
+ extraEnv: []
+ # -- Environment variables from secrets or configmaps to add to the ingester pods
+ extraEnvFrom: []
+ # -- Volume mounts to add to the ingester pods
+ extraVolumeMounts: []
+ # -- Volumes to add to the ingester pods
+ extraVolumes: []
+ # -- Resource requests and limits for the ingester
+ resources: {}
+ # -- Containers to add to the ingester pods
+ extraContainers: []
+ # -- Init containers to add to the ingester pods
+ initContainers: []
+ # -- Grace period to allow the ingester to shutdown before it is killed. This must be long enough
+ # so ingesters can be gracefully shut down, flushing/transferring all data and successfully
+ # leaving the member ring on shutdown.
+ terminationGracePeriodSeconds: 300
+ # -- Lifecycle for the ingester container
+ lifecycle: {}
+ # -- topologySpread for ingester pods.
+ # @default -- Defaults to allow skew no more than 1 node
+ # The value will be passed through tpl.
+ topologySpreadConstraints:
+ - maxSkew: 1
+ topologyKey: kubernetes.io/hostname
+ whenUnsatisfiable: ScheduleAnyway
+ labelSelector:
+ matchLabels:
+ app.kubernetes.io/component: ingester
+ app.kubernetes.io/name: '{{ include "loki.name" . }}'
+ app.kubernetes.io/instance: "{{ .Release.Name }}"
+ # -- Affinity for ingester pods. Ignored if zoneAwareReplication is enabled.
+ # @default -- Hard node anti-affinity
+ # The value will be passed through tpl.
+ affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchLabels:
+ app.kubernetes.io/component: ingester
+ app.kubernetes.io/name: '{{ include "loki.name" . }}'
+ app.kubernetes.io/instance: "{{ .Release.Name }}"
+ topologyKey: kubernetes.io/hostname
+ # -- Pod Disruption Budget maxUnavailable
+ maxUnavailable: 1
+ # -- Node selector for ingester pods
+ nodeSelector: {}
+ # -- Tolerations for ingester pods
+ tolerations: []
+ # -- readiness probe settings for ingester pods. If empty, use `loki.readinessProbe`
+ readinessProbe: {}
+ # -- liveness probe settings for ingester pods. If empty use `loki.livenessProbe`
+ livenessProbe: {}
+ # -- UpdateStrategy for the ingester StatefulSets.
+ updateStrategy:
+ # -- One of 'OnDelete' or 'RollingUpdate'
+ type: RollingUpdate
+ # -- Optional for updateStrategy.type=RollingUpdate. See [Partitioned rolling updates](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#partitions) in the StatefulSet docs for details.
+ # rollingUpdate:
+ # partition: 0
+ persistence:
+ # -- Enable creating PVCs which is required when using boltdb-shipper
+ enabled: false
+ # -- Use emptyDir with ramdisk for storage. 
**Please note that all data in ingester will be lost on pod restart**
+ inMemory: false
+ # -- List of the ingester PVCs
+ # @notationType -- list
+ claims:
+ - name: data
+ # -- Set access modes on the PersistentVolumeClaim
+ accessModes:
+ - ReadWriteOnce
+ size: 10Gi
+ # -- Storage class to be used.
+ # If defined, storageClassName: <storageClass>.
+ # If set to "-", storageClassName: "", which disables dynamic provisioning.
+ # If empty or set to null, no storageClassName spec is
+ # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack).
+ storageClass: null
+ # - name: wal
+ # size: 150Gi
+ # -- Enable StatefulSetAutoDeletePVC feature
+ enableStatefulSetAutoDeletePVC: false
+ whenDeleted: Retain
+ whenScaled: Retain
+ # -- Adds the appProtocol field to the ingester service. This allows ingester to work with istio protocol selection.
+ appProtocol:
+ # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https"
+ grpc: ""
+ # -- Enabling zone awareness on ingesters will create 3 StatefulSets, where each write sends a replica to each zone.
+ # This is primarily intended to accelerate rollout operations by allowing multiple ingesters within a single
+ # zone to be shut down and restarted simultaneously (the remaining 2 zones are guaranteed to have at least one copy
+ # of the data).
+ # Note: This can be used to run Loki across multiple cloud provider availability zones; however, this is not currently
+ # recommended, as Loki is not optimized for it and cross-zone network traffic costs can climb extremely
+ # quickly. Even with zone awareness enabled, it is recommended to run Loki in a single availability zone.
+ zoneAwareReplication:
+ # -- Enable zone awareness.
+ enabled: true
+ # -- The percent of replicas in each zone that will be restarted at once, expressed as a value of 0-100
+ maxUnavailablePct: 33
+ # -- zoneA configuration
+ zoneA:
+ # -- optionally define a node selector for this zone
+ nodeSelector: null
+ # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
+ # The value will be passed through tpl.
+ extraAffinity: {}
+ # -- Specific annotations to add to zone A statefulset
+ annotations: {}
+ # -- Specific annotations to add to zone A pods
+ podAnnotations: {}
+ zoneB:
+ # -- optionally define a node selector for this zone
+ nodeSelector: null
+ # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
+ # The value will be passed through tpl.
+ extraAffinity: {}
+ # -- Specific annotations to add to zone B statefulset
+ annotations: {}
+ # -- Specific annotations to add to zone B pods
+ podAnnotations: {}
+ zoneC:
+ # -- optionally define a node selector for this zone
+ nodeSelector: null
+ # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
+ # The value will be passed through tpl.
+ extraAffinity: {}
+ # -- Specific annotations to add to zone C statefulset
+ annotations: {}
+ # -- Specific annotations to add to zone C pods
+ podAnnotations: {}
+ # -- The migration block allows migrating non zone aware ingesters to zone aware ingesters. 
+ migration: + enabled: false + excludeDefaultZone: false + readPath: false + writePath: false + + # optionally allow adding arbitrary prefix to the ingester rollout-group label + rolloutGroupPrefix: null + # optionally allow adding 'loki-' prefix to ingester name label + addIngesterNamePrefix: false + +# -- Configuration for the distributor +distributor: + # -- Number of replicas for the distributor + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the distributor + hostUsers: nil + # -- DNSConfig for distributor pods + dnsConfig: {} + autoscaling: + # -- Enable autoscaling for the distributor + enabled: false + # -- Minimum autoscaling replicas for the distributor + minReplicas: 1 + # -- Maximum autoscaling replicas for the distributor + maxReplicas: 3 + # -- Target CPU utilisation percentage for the distributor + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the distributor + targetMemoryUtilizationPercentage: null + # -- Allows one to define custom metrics using the HPA/v2 schema (for example, Pods, Object or External metrics) + customMetrics: [] + # - type: Pods + # pods: + # metric: + # name: loki_lines_total + # target: + # type: AverageValue + # averageValue: 10k + behavior: + # -- Enable autoscaling behaviours + enabled: false + # -- define scale down policies, must conform to HPAScalingRules + scaleDown: {} + # -- define scale up policies, must conform to HPAScalingRules + scaleUp: {} + image: + # -- The Docker registry for the distributor image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the distributor image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the distributor image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for distributor pods + priorityClassName: null + # -- Labels for distributor pods + podLabels: {} + # -- Annotations for distributor pods + podAnnotations: {} + # -- Labels for distributor service + serviceLabels: {} + # -- Annotations for distributor service + serviceAnnotations: {} + # -- Service type for distributor service + serviceType: ClusterIP + # -- Additional CLI args for the distributor + extraArgs: [] + # -- Environment variables to add to the distributor pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the distributor pods + extraEnvFrom: [] + # -- Volume mounts to add to the distributor pods + extraVolumeMounts: [] + # -- Volumes to add to the distributor pods + extraVolumes: [] + # -- Resource requests and limits for the distributor + resources: {} + # -- Init containers to add to the distributor pods + initContainers: [] + # -- Containers to add to the distributor pods + extraContainers: [] + # -- Grace period to allow the distributor to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for distributor pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: distributor + app.kubernetes.io/name: '{{ include "loki.name" . 
}}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Max Surge for distributor pods + maxSurge: 0 + # -- Node selector for distributor pods + nodeSelector: {} + # -- Topology Spread Constraints for distributor pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for distributor pods + tolerations: [] + # -- Adds the appProtocol field to the distributor service. This allows distributor to work with istio protocol selection. + appProtocol: + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + grpc: "" + # -- trafficDistribution for distributor service + trafficDistribution: "" +# -- Configuration for the querier +querier: + # -- Number of replicas for the querier + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the querier + hostUsers: nil + autoscaling: + # -- Enable autoscaling for the querier, this is only used if `indexGateway.enabled: true` + enabled: false + # -- Minimum autoscaling replicas for the querier + minReplicas: 1 + # -- Maximum autoscaling replicas for the querier + maxReplicas: 3 + # -- Target CPU utilisation percentage for the querier + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the querier + targetMemoryUtilizationPercentage: null + # -- Allows one to define custom metrics using the HPA/v2 schema (for example, Pods, Object or External metrics) + customMetrics: [] + # - type: External + # external: + # metric: + # name: loki_inflight_queries + # target: + # type: AverageValue + # averageValue: 12 + behavior: + # -- Enable autoscaling behaviours + enabled: false + # -- define scale down policies, must conform to HPAScalingRules + scaleDown: {} + # -- define scale up policies, must conform to HPAScalingRules + scaleUp: {} + image: + # -- The Docker registry for the querier image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the querier image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the querier image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for querier pods + priorityClassName: null + # -- Labels for querier pods + podLabels: {} + # -- Annotations for querier pods + podAnnotations: {} + # -- Labels for querier service + serviceLabels: {} + # -- Annotations for querier service + serviceAnnotations: {} + # -- Service Type for querier service + serviceType: "ClusterIP" + # -- Additional CLI args for the querier + extraArgs: [] + # -- Environment variables to add to the querier pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the querier pods + extraEnvFrom: [] + # -- Volume mounts to add to the querier pods + extraVolumeMounts: [] + # -- Volumes to add to the querier pods + extraVolumes: [] + # -- Resource requests and limits for the querier + resources: {} + # -- Containers to add to the querier pods + extraContainers: [] + # -- Init containers to add to the querier pods + initContainers: [] + # -- Grace period to allow the querier to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- topologySpread for querier pods. 
+ # @default -- Defaults to allow skew no more than 1 node
+ # The value will be passed through tpl.
+ topologySpreadConstraints:
+ - maxSkew: 1
+ topologyKey: kubernetes.io/hostname
+ whenUnsatisfiable: ScheduleAnyway
+ labelSelector:
+ matchLabels:
+ app.kubernetes.io/component: querier
+ app.kubernetes.io/name: '{{ include "loki.name" . }}'
+ app.kubernetes.io/instance: "{{ .Release.Name }}"
+ # -- Affinity for querier pods.
+ # @default -- Hard node anti-affinity
+ # The value will be passed through tpl.
+ affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchLabels:
+ app.kubernetes.io/component: querier
+ app.kubernetes.io/name: '{{ include "loki.name" . }}'
+ app.kubernetes.io/instance: "{{ .Release.Name }}"
+ topologyKey: kubernetes.io/hostname
+ # -- Pod Disruption Budget maxUnavailable
+ maxUnavailable: null
+ # -- Max Surge for querier pods
+ maxSurge: 0
+ # -- Node selector for querier pods
+ nodeSelector: {}
+ # -- Tolerations for querier pods
+ tolerations: []
+ # -- DNSConfig for querier pods
+ dnsConfig: {}
+ # -- Adds the appProtocol field to the querier service. This allows querier to work with istio protocol selection.
+ appProtocol:
+ # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https"
+ grpc: ""
+# -- Configuration for the query-frontend
+queryFrontend:
+ # -- Number of replicas for the query-frontend
+ replicas: 0
+ # -- hostAliases to add
+ hostAliases: []
+ # - ip: 1.2.3.4
+ # hostnames:
+ # - domain.tld
+ # -- Use the host's user namespace in the query-frontend
+ hostUsers: nil
+ autoscaling:
+ # -- Enable autoscaling for the query-frontend
+ enabled: false
+ # -- Minimum autoscaling replicas for the query-frontend
+ minReplicas: 1
+ # -- Maximum autoscaling replicas for the query-frontend
+ maxReplicas: 3
+ # -- Target CPU utilisation percentage for the query-frontend
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilisation percentage for the query-frontend
+ targetMemoryUtilizationPercentage: null
+ # -- Allows one to define custom metrics using the HPA/v2 schema (for example, Pods, Object or External metrics)
+ customMetrics: []
+ # - type: Pods
+ # pods:
+ # metric:
+ # name: loki_query_rate
+ # target:
+ # type: AverageValue
+ # averageValue: 100
+ behavior:
+ # -- Enable autoscaling behaviours
+ enabled: false
+ # -- define scale down policies, must conform to HPAScalingRules
+ scaleDown: {}
+ # -- define scale up policies, must conform to HPAScalingRules
+ scaleUp: {}
+ image:
+ # -- The Docker registry for the query-frontend image. Overrides `loki.image.registry`
+ registry: null
+ # -- Docker image repository for the query-frontend image. Overrides `loki.image.repository`
+ repository: null
+ # -- Docker image tag for the query-frontend image. 
Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for query-frontend pods + priorityClassName: null + # -- Labels for query-frontend pods + podLabels: {} + # -- Annotations for query-frontend pods + podAnnotations: {} + # -- Labels for query-frontend service + serviceLabels: {} + # -- Annotations for query-frontend service + serviceAnnotations: {} + # -- Service Type for query-frontend service + serviceType: ClusterIP + # -- Additional CLI args for the query-frontend + extraArgs: [] + # -- Environment variables to add to the query-frontend pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the query-frontend pods + extraEnvFrom: [] + # -- Volume mounts to add to the query-frontend pods + extraVolumeMounts: [] + # -- Volumes to add to the query-frontend pods + extraVolumes: [] + # -- Resource requests and limits for the query-frontend + resources: {} + # -- init containers to add to the query-frontend pods + initContainers: [] + # -- Containers to add to the query-frontend pods + extraContainers: [] + # -- Grace period to allow the query-frontend to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for query-frontend pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: query-frontend + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Node selector for query-frontend pods + nodeSelector: {} + # -- Topology Spread Constraints for query-frontend pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for query-frontend pods + tolerations: [] + # -- Adds the appProtocol field to the queryFrontend service. This allows queryFrontend to work with istio protocol selection. + appProtocol: + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + grpc: "" +# -- Configuration for the query-scheduler +queryScheduler: + # -- Number of replicas for the query-scheduler. + # It should be lower than `-querier.max-concurrent` to avoid generating back-pressure in queriers; + # it's also recommended that this value evenly divides the latter + replicas: 0 + # -- DNSConfig for query-scheduler + dnsConfig: {} + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the query-scheduler + hostUsers: nil + image: + # -- The Docker registry for the query-scheduler image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the query-scheduler image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the query-scheduler image. 
Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for query-scheduler pods + priorityClassName: null + # -- Labels for query-scheduler pods + podLabels: {} + # -- Annotations for query-scheduler pods + podAnnotations: {} + # -- Labels for query-scheduler service + serviceLabels: {} + # -- Annotations for query-scheduler service + serviceAnnotations: {} + # -- Additional CLI args for the query-scheduler + extraArgs: [] + # -- Environment variables to add to the query-scheduler pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the query-scheduler pods + extraEnvFrom: [] + # -- Volume mounts to add to the query-scheduler pods + extraVolumeMounts: [] + # -- Volumes to add to the query-scheduler pods + extraVolumes: [] + # -- Resource requests and limits for the query-scheduler + resources: {} + # -- init containers to add to the query-scheduler pods + initContainers: [] + # -- Containers to add to the query-scheduler pods + extraContainers: [] + # -- Grace period to allow the query-scheduler to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for query-scheduler pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: query-scheduler + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: 1 + # -- Node selector for query-scheduler pods + nodeSelector: {} + # -- Topology Spread Constraints for query-scheduler pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for query-scheduler pods + tolerations: [] + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" +# -- Configuration for the index-gateway +indexGateway: + # -- Number of replicas for the index-gateway + replicas: 0 + # -- Whether the index gateway should join the memberlist hashring + joinMemberlist: true + # -- DNSConfig for index-gateway pods + dnsConfig: {} + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the index-gateway + hostUsers: nil + image: + # -- The Docker registry for the index-gateway image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the index-gateway image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the index-gateway image. 
Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for index-gateway pods + priorityClassName: null + # -- Labels for index-gateway pods + podLabels: {} + # -- Annotations for index-gateway pods + podAnnotations: {} + # -- Labels for index-gateway service + serviceLabels: {} + # -- Annotations for index-gateway service + serviceAnnotations: {} + # -- Service type for index-gateway service + serviceType: "ClusterIP" + # -- Additional CLI args for the index-gateway + extraArgs: [] + # -- Environment variables to add to the index-gateway pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the index-gateway pods + extraEnvFrom: [] + # -- Volume mounts to add to the index-gateway pods + extraVolumeMounts: [] + # -- Volumes to add to the index-gateway pods + extraVolumes: [] + # -- Resource requests and limits for the index-gateway + resources: {} + # -- Containers to add to the index-gateway pods + extraContainers: [] + # -- Init containers to add to the index-gateway pods + initContainers: [] + # -- Grace period to allow the index-gateway to shutdown before it is killed. + terminationGracePeriodSeconds: 300 + # -- Lifecycle for the index-gateway container + lifecycle: {} + # -- Affinity for index-gateway pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: index-gateway + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Node selector for index-gateway pods + nodeSelector: {} + # -- Topology Spread Constraints for index-gateway pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for index-gateway pods + tolerations: [] + persistence: + # -- Enable creating PVCs which is required when using boltdb-shipper + enabled: false + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + # -- Use emptyDir with ramdisk for storage. **Please note that all data in indexGateway will be lost on pod restart** + inMemory: false + # -- Size of persistent or memory disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Annotations for index gateway PVCs + annotations: {} + # -- Labels for index gateway PVCs + labels: {} + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: false + whenDeleted: Retain + whenScaled: Retain + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + # -- UpdateStrategy for the indexGateway StatefulSet. + updateStrategy: + # -- One of 'OnDelete' or 'RollingUpdate' + type: RollingUpdate + # -- Optional for updateStrategy.type=RollingUpdate. See [Partitioned rolling updates](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#partitions) in the StatefulSet docs for details. 
+ # rollingUpdate: + # partition: 0 +# -- Configuration for the compactor +compactor: + # -- Number of replicas for the compactor + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the compactor + hostUsers: nil + # -- DNSConfig for compactor pods + dnsConfig: {} + image: + # -- The Docker registry for the compactor image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the compactor image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the compactor image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for compactor pods + priorityClassName: null + # -- Labels for compactor pods + podLabels: {} + # -- Annotations for compactor pods + podAnnotations: {} + # -- Affinity for compactor pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: compactor + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Labels for compactor service + serviceLabels: {} + # -- Annotations for compactor service + serviceAnnotations: {} + # -- Service type for compactor service + serviceType: "ClusterIP" + # -- Additional CLI args for the compactor + extraArgs: [] + # -- Environment variables to add to the compactor pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the compactor pods + extraEnvFrom: [] + # -- Volume mounts to add to the compactor pods + extraVolumeMounts: [] + # -- Volumes to add to the compactor pods + extraVolumes: [] + # -- readiness probe settings for ingester pods. If empty, use `loki.readinessProbe` + readinessProbe: {} + # -- liveness probe settings for ingester pods. If empty use `loki.livenessProbe` + livenessProbe: {} + # -- Resource requests and limits for the compactor + resources: {} + # -- Containers to add to the compactor pods + extraContainers: [] + # -- Init containers to add to the compactor pods + initContainers: [] + # -- Grace period to allow the compactor to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Node selector for compactor pods + nodeSelector: {} + # -- Tolerations for compactor pods + tolerations: [] + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + persistence: + # -- Enable creating PVCs for the compactor + enabled: false + # -- List of the compactor PVCs + # @notationType -- list + claims: + - name: data + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). 
+ storageClass: null + # -- Annotations for compactor PVCs + annotations: {} + # -- Labels for compactor PVCs + labels: {} + # - name: wal + # size: 150Gi + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: false + whenDeleted: Retain + whenScaled: Retain + serviceAccount: + create: false + # -- The name of the ServiceAccount to use for the compactor. + # If not set and create is true, a name is generated by appending + # "-compactor" to the common ServiceAccount. + name: null + # -- Image pull secrets for the compactor service account + imagePullSecrets: [] + # -- Annotations for the compactor service account + annotations: {} + # -- Set this toggle to false to opt out of automounting API credentials for the service account + automountServiceAccountToken: true +# -- Configuration for the bloom-gateway +bloomGateway: + # -- Number of replicas for the bloom-gateway + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the bloom-gateway + hostUsers: nil + # -- DNSConfig for bloom-gateway pods + dnsConfig: {} + image: + # -- The Docker registry for the bloom-gateway image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the bloom-gateway image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the bloom-gateway image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for bloom-gateway pods + priorityClassName: null + # -- Labels for bloom-gateway pods + podLabels: {} + # -- Annotations for bloom-gateway pods + podAnnotations: {} + # -- Affinity for bloom-gateway pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: bloom-gateway + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Labels for bloom-gateway service + serviceLabels: {} + # -- Annotations for bloom-gateway service + serviceAnnotations: {} + # -- Additional CLI args for the bloom-gateway + extraArgs: [] + # -- Environment variables to add to the bloom-gateway pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the bloom-gateway pods + extraEnvFrom: [] + # -- Volume mounts to add to the bloom-gateway pods + extraVolumeMounts: [] + # -- Volumes to add to the bloom-gateway pods + extraVolumes: [] + # -- readiness probe settings for ingester pods. If empty, use `loki.readinessProbe` + readinessProbe: {} + # -- liveness probe settings for ingester pods. If empty use `loki.livenessProbe` + livenessProbe: {} + # -- startup probe settings for ingester pods. If empty, use `loki.startupProbe` + startupProbe: {} + # -- Resource requests and limits for the bloom-gateway + resources: {} + # -- Containers to add to the bloom-gateway pods + extraContainers: [] + # -- Init containers to add to the bloom-gateway pods + initContainers: [] + # -- Grace period to allow the bloom-gateway to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Node selector for bloom-gateway pods + nodeSelector: {} + # -- Tolerations for bloom-gateway pods + tolerations: [] + # -- Set the optional grpc service protocol. 
Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + persistence: + # -- Enable creating PVCs for the bloom-gateway + enabled: false + # -- Annotations for bloom-gateway PVCs + annotations: {} + # -- Labels for bloom gateway PVCs + labels: {} + # -- List of the bloom-gateway PVCs + # @notationType -- list + claims: + - name: data + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: false + whenDeleted: Retain + whenScaled: Retain + serviceAccount: + create: false + # -- The name of the ServiceAccount to use for the bloom-gateway. + # If not set and create is true, a name is generated by appending + # "-bloom-gateway" to the common ServiceAccount. + name: null + # -- Image pull secrets for the bloom-gateway service account + imagePullSecrets: [] + # -- Annotations for the bloom-gateway service account + annotations: {} + # -- Set this toggle to false to opt out of automounting API credentials for the service account + automountServiceAccountToken: true + +# -- Configuration for the bloom-planner +bloomPlanner: + # -- Number of replicas for the bloom-planner + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the bloom-planner + hostUsers: nil + # -- DNSConfig for bloom-planner pods + dnsConfig: {} + image: + # -- The Docker registry for the bloom-planner image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the bloom-planner image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the bloom-planner image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for bloom-planner pods + priorityClassName: null + # -- Labels for bloom-planner pods + podLabels: {} + # -- Annotations for bloom-planner pods + podAnnotations: {} + # -- Affinity for bloom-planner pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: bloom-planner + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Labels for bloom-planner service + serviceLabels: {} + # -- Annotations for bloom-planner service + serviceAnnotations: {} + # -- Additional CLI args for the bloom-planner + extraArgs: [] + # -- Environment variables to add to the bloom-planner pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the bloom-planner pods + extraEnvFrom: [] + # -- Volume mounts to add to the bloom-planner pods + extraVolumeMounts: [] + # -- Volumes to add to the bloom-planner pods + extraVolumes: [] + # -- readiness probe settings for ingester pods. If empty, use `loki.readinessProbe` + readinessProbe: {} + # -- liveness probe settings for ingester pods. 
If empty use `loki.livenessProbe`
+ livenessProbe: {}
+ # -- startup probe settings for ingester pods. If empty use `loki.startupProbe`
+ startupProbe: {}
+ # -- Resource requests and limits for the bloom-planner
+ resources: {}
+ # -- Containers to add to the bloom-planner pods
+ extraContainers: []
+ # -- Init containers to add to the bloom-planner pods
+ initContainers: []
+ # -- Grace period to allow the bloom-planner to shutdown before it is killed
+ terminationGracePeriodSeconds: 30
+ # -- Node selector for bloom-planner pods
+ nodeSelector: {}
+ # -- Tolerations for bloom-planner pods
+ tolerations: []
+ # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https"
+ appProtocol:
+ grpc: ""
+ persistence:
+ # -- Enable creating PVCs for the bloom-planner
+ enabled: false
+ # -- List of the bloom-planner PVCs
+ # @notationType -- list
+ claims:
+ - name: data
+ # -- Set access modes on the PersistentVolumeClaim
+ accessModes:
+ - ReadWriteOnce
+ # -- Size of persistent disk
+ size: 10Gi
+ # -- Storage class to be used.
+ # If defined, storageClassName: <storageClass>.
+ # If set to "-", storageClassName: "", which disables dynamic provisioning.
+ # If empty or set to null, no storageClassName spec is
+ # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack).
+ storageClass: null
+ # -- Annotations for bloom-planner PVCs
+ annotations: {}
+ # -- Labels for bloom planner PVCs
+ labels: {}
+ # -- Enable StatefulSetAutoDeletePVC feature
+ enableStatefulSetAutoDeletePVC: false
+ whenDeleted: Retain
+ whenScaled: Retain
+ serviceAccount:
+ create: false
+ # -- The name of the ServiceAccount to use for the bloom-planner.
+ # If not set and create is true, a name is generated by appending
+ # "-bloom-planner" to the common ServiceAccount.
+ name: null
+ # -- Image pull secrets for the bloom-planner service account
+ imagePullSecrets: []
+ # -- Annotations for the bloom-planner service account
+ annotations: {}
+ # -- Set this toggle to false to opt out of automounting API credentials for the service account
+ automountServiceAccountToken: true
+
+# -- Configuration for the bloom-builder
+bloomBuilder:
+ # -- Number of replicas for the bloom-builder
+ replicas: 0
+ # -- hostAliases to add
+ hostAliases: []
+ # - ip: 1.2.3.4
+ # hostnames:
+ # - domain.tld
+ # -- Use the host's user namespace in the bloom-builder
+ hostUsers: nil
+ # -- DNSConfig for bloom-builder pods
+ dnsConfig: {}
+ autoscaling:
+ # -- Enable autoscaling for the bloom-builder
+ enabled: false
+ # -- Minimum autoscaling replicas for the bloom-builder
+ minReplicas: 1
+ # -- Maximum autoscaling replicas for the bloom-builder
+ maxReplicas: 3
+ # -- Target CPU utilisation percentage for the bloom-builder
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilisation percentage for the bloom-builder
+ targetMemoryUtilizationPercentage: null
+ # -- Allows one to define custom metrics using the HPA/v2 schema (for example, Pods, Object or External metrics)
+ customMetrics: []
+ # - type: Pods
+ # pods:
+ # metric:
+ # name: loki_query_rate
+ # target:
+ # type: AverageValue
+ # averageValue: 100
+ behavior:
+ # -- Enable autoscaling behaviours
+ enabled: false
+ # -- define scale down policies, must conform to HPAScalingRules
+ scaleDown: {}
+ # -- define scale up policies, must conform to HPAScalingRules
+ scaleUp: {}
+ image:
+ # -- The Docker registry for the bloom-builder image. Overrides `loki.image.registry`
+ registry: null
+ # -- Docker image repository for the bloom-builder image. 
Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the bloom-builder image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for bloom-builder pods + priorityClassName: null + # -- Labels for bloom-builder pods + podLabels: {} + # -- Annotations for bloom-builder pods + podAnnotations: {} + # -- Labels for bloom-builder service + serviceLabels: {} + # -- Annotations for bloom-builder service + serviceAnnotations: {} + # -- Additional CLI args for the bloom-builder + extraArgs: [] + # -- Environment variables to add to the bloom-builder pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the bloom-builder pods + extraEnvFrom: [] + # -- Volume mounts to add to the bloom-builder pods + extraVolumeMounts: [] + # -- Volumes to add to the bloom-builder pods + extraVolumes: [] + # -- Resource requests and limits for the bloom-builder + resources: {} + # -- Init containers to add to the bloom-builder pods + initContainers: [] + # -- Containers to add to the bloom-builder pods + extraContainers: [] + # -- Grace period to allow the bloom-builder to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for bloom-builder pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: bloom-builder + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Node selector for bloom-builder pods + nodeSelector: {} + # -- Tolerations for bloom-builder pods + tolerations: [] + # -- Adds the appProtocol field to the queryFrontend service. This allows bloomBuilder to work with istio protocol selection. + appProtocol: + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + grpc: "" + +# -- Configuration for the pattern ingester +patternIngester: + # -- Number of replicas for the pattern ingester + replicas: 0 + # -- DNSConfig for pattern ingester pods + dnsConfig: {} + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the pattern ingester + hostUsers: nil + image: + # -- The Docker registry for the pattern ingester image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the pattern ingester image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the pattern ingester image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for pattern ingester pods + priorityClassName: null + # -- Labels for pattern ingester pods + podLabels: {} + # -- Annotations for pattern ingester pods + podAnnotations: {} + # -- Affinity for pattern ingester pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: pattern-ingester + app.kubernetes.io/name: '{{ include "loki.name" . 
}}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Labels for pattern ingester service + serviceLabels: {} + # -- Annotations for pattern ingester service + serviceAnnotations: {} + # -- Additional CLI args for the pattern ingester + extraArgs: [] + # -- Environment variables to add to the pattern ingester pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the pattern ingester pods + extraEnvFrom: [] + # -- Volume mounts to add to the pattern ingester pods + extraVolumeMounts: [] + # -- Volumes to add to the pattern ingester pods + extraVolumes: [] + # -- readiness probe settings for ingester pods. If empty, use `loki.readinessProbe` + readinessProbe: {} + # -- liveness probe settings for ingester pods. If empty use `loki.livenessProbe` + livenessProbe: {} + # -- Resource requests and limits for the pattern ingester + resources: {} + # -- Containers to add to the pattern ingester pods + extraContainers: [] + # -- Init containers to add to the pattern ingester pods + initContainers: [] + # -- Grace period to allow the pattern ingester to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Node selector for pattern ingester pods + nodeSelector: {} + # -- Topology Spread Constraints for pattern ingester pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for pattern ingester pods + tolerations: [] + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + persistence: + # -- Enable creating PVCs for the pattern ingester + enabled: false + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- List of the pattern ingester PVCs + # @notationType -- list + claims: + - name: data + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Annotations for pattern ingester PVCs + annotations: {} + # -- Labels for pattern ingester PVCs + labels: {} + # - name: wal + # size: 150Gi + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: false + whenDeleted: Retain + whenScaled: Retain + serviceAccount: + create: false + # -- The name of the ServiceAccount to use for the pattern ingester. + # If not set and create is true, a name is generated by appending + # "-pattern-ingester" to the common ServiceAccount. + name: null + # -- Image pull secrets for the pattern ingester service account + imagePullSecrets: [] + # -- Annotations for the pattern ingester service account + annotations: {} + # -- Set this toggle to false to opt out of automounting API credentials for the service account + automountServiceAccountToken: true +# -- Configuration for the ruler +ruler: + # -- The ruler component is optional and can be disabled if desired. 
+ enabled: false + # -- Whether to enable the rules sidecar + sidecar: false + # -- Number of replicas for the ruler + replicas: 0 + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the ruler + hostUsers: nil + image: + # -- The Docker registry for the ruler image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the ruler image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the ruler image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for ruler pods + priorityClassName: null + # -- Labels for compactor pods + podLabels: {} + # -- Annotations for ruler pods + podAnnotations: {} + # -- Labels for ruler service + serviceLabels: {} + # -- Annotations for ruler service + serviceAnnotations: {} + # -- Additional CLI args for the ruler + extraArgs: [] + # -- Environment variables to add to the ruler pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the ruler pods + extraEnvFrom: [] + # -- Volume mounts to add to the ruler pods + extraVolumeMounts: [] + # -- Volumes to add to the ruler pods + extraVolumes: [] + # -- Resource requests and limits for the ruler + resources: {} + # -- Containers to add to the ruler pods + extraContainers: [] + # -- Init containers to add to the ruler pods + initContainers: [] + # -- Grace period to allow the ruler to shutdown before it is killed + terminationGracePeriodSeconds: 300 + # -- Affinity for ruler pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: ruler + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Node selector for ruler pods + nodeSelector: {} + # -- Topology Spread Constraints for ruler pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for ruler pods + tolerations: [] + # -- DNSConfig for ruler pods + dnsConfig: {} + persistence: + # -- Enable creating PVCs which is required when using recording rules + enabled: false + # -- Set access modes on the PersistentVolumeClaim + accessModes: + - ReadWriteOnce + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Annotations for ruler PVCs + annotations: {} + # -- Labels for ruler PVCs + labels: {} + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + # -- Directories containing rules files. If used, you must also configure `loki.rulerConfig.storage` to use local storage. 
+ directories: {} + # tenant_foo: + # rules1.txt: | + # groups: + # - name: should_fire + # rules: + # - alert: HighPercentageError + # expr: | + # sum(rate({app="foo", env="production"} |= "error" [5m])) by (job) + # / + # sum(rate({app="foo", env="production"}[5m])) by (job) + # > 0.05 + # for: 10m + # labels: + # severity: warning + # annotations: + # summary: High error rate + # - name: credentials_leak + # rules: + # - alert: http-credentials-leaked + # annotations: + # message: "{{ $labels.job }} is leaking http basic auth credentials." + # expr: 'sum by (cluster, job, pod) (count_over_time({namespace="prod"} |~ "http(s?)://(\\w+):(\\w+)@" [5m]) > 0)' + # for: 10m + # labels: + # severity: critical + # rules2.txt: | + # groups: + # - name: example + # rules: + # - alert: HighThroughputLogStreams + # expr: sum by(container) (rate({job=~"loki-dev/.*"}[1m])) > 1000 + # for: 2m + # tenant_bar: + # rules1.txt: | + # groups: + # - name: should_fire + # rules: + # - alert: HighPercentageError + # expr: | + # sum(rate({app="foo", env="production"} |= "error" [5m])) by (job) + # / + # sum(rate({app="foo", env="production"}[5m])) by (job) + # > 0.05 + # for: 10m + # labels: + # severity: warning + # annotations: + # summary: High error rate + # - name: credentials_leak + # rules: + # - alert: http-credentials-leaked + # annotations: + # message: "{{ $labels.job }} is leaking http basic auth credentials." + # expr: 'sum by (cluster, job, pod) (count_over_time({namespace="prod"} |~ "http(s?)://(\\w+):(\\w+)@" [5m]) > 0)' + # for: 10m + # labels: + # severity: critical + # rules2.txt: | + # groups: + # - name: example + # rules: + # - alert: HighThroughputLogStreams + # expr: sum by(container) (rate({job=~"loki-dev/.*"}[1m])) > 1000 + # for: 2m + +# -- Configuration for the overrides-exporter +overridesExporter: + # -- The overrides-exporter component is optional and can be disabled if desired. + enabled: false + # -- Number of replicas for the overrides-exporter + replicas: 0 + # -- DNSConfig for overrides-exporter + dnsConfig: {} + # -- hostAliases to add + hostAliases: [] + # - ip: 1.2.3.4 + # hostnames: + # - domain.tld + # -- Use the host's user namespace in the overrides-exporter + hostUsers: nil + image: + # -- The Docker registry for the overrides-exporter image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the overrides-exporter image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the overrides-exporter image. 
Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for overrides-exporter pods + priorityClassName: null + # -- Labels for overrides-exporter pods + podLabels: {} + # -- Annotations for overrides-exporter pods + podAnnotations: {} + # -- Labels for overrides-exporter service + serviceLabels: {} + # -- Annotations for overrides-exporter service + serviceAnnotations: {} + # -- Additional CLI args for the overrides-exporter + extraArgs: [] + # -- Environment variables to add to the overrides-exporter pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the overrides-exporter pods + extraEnvFrom: [] + # -- Volume mounts to add to the overrides-exporter pods + extraVolumeMounts: [] + # -- Volumes to add to the overrides-exporter pods + extraVolumes: [] + # -- Resource requests and limits for the overrides-exporter + resources: {} + # -- Containers to add to the overrides-exporter pods + extraContainers: [] + # -- Init containers to add to the overrides-exporter pods + initContainers: [] + # -- Grace period to allow the overrides-exporter to shutdown before it is killed + terminationGracePeriodSeconds: 300 + # -- Affinity for overrides-exporter pods. + # @default -- Hard node anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: overrides-exporter + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: null + # -- Node selector for overrides-exporter pods + nodeSelector: {} + # -- Topology Spread Constraints for overrides-exporter pods + # The value will be passed through tpl. + topologySpreadConstraints: [] + # -- Tolerations for overrides-exporter pods + tolerations: [] + # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" + appProtocol: + grpc: "" + +# You can use a self hosted memcached by setting enabled to false and providing addresses. 
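+# Example (a sketch, not part of the chart defaults; the service address below is
+# an assumed placeholder): to use a self-hosted memcached, disable the bundled
+# server and point the cache sections at your own endpoint, e.g.
+#
+#   memcached:
+#     enabled: false
+#   chunksCache:
+#     addresses: dns+memcached.cache.svc.cluster.local:11211
+#   resultsCache:
+#     addresses: dns+memcached.cache.svc.cluster.local:11211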
+memcached: + # -- Enable the built in memcached server provided by the chart + enabled: true + image: + # -- Memcached Docker image repository + repository: memcached + # -- Memcached Docker image tag + tag: 1.6.39-alpine + # -- Memcached Docker image pull policy + pullPolicy: IfNotPresent + # -- The SecurityContext override for memcached pods + podSecurityContext: + runAsNonRoot: true + runAsUser: 11211 + runAsGroup: 11211 + fsGroup: 11211 + # -- The name of the PriorityClass for memcached pods + priorityClassName: null + # -- The SecurityContext for memcached containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + allowPrivilegeEscalation: false + # -- Readiness probe for memcached pods (probe port defaults to container port) + readinessProbe: + tcpSocket: + port: client + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 6 + # -- Liveness probe for memcached pods + livenessProbe: + tcpSocket: + port: client + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # -- Startup probe for memcached pods + startupProbe: {} + +memcachedExporter: + # -- Whether memcached metrics should be exported + enabled: true + image: + repository: prom/memcached-exporter + tag: v0.15.3 + pullPolicy: IfNotPresent + resources: + requests: {} + limits: {} + # -- The SecurityContext for memcached exporter containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + allowPrivilegeEscalation: false + # -- Extra args to add to the exporter container. + # Example: + # extraArgs: + # memcached.tls.enable: true + # memcached.tls.cert-file: /certs/cert.crt + # memcached.tls.key-file: /certs/cert.key + # memcached.tls.ca-file: /certs/ca.crt + # memcached.tls.insecure-skip-verify: false + # memcached.tls.server-name: memcached + extraArgs: {} + # -- Liveness probe for memcached exporter + livenessProbe: + httpGet: + path: /metrics + port: http-metrics + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + # -- Readiness probe for memcached exporter + readinessProbe: + httpGet: + path: /metrics + port: http-metrics + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + # -- Startup probe for memcached exporter + startupProbe: {} + +resultsCache: + # -- Specifies whether memcached based results-cache should be enabled + enabled: true + # -- Comma separated addresses list in DNS Service Discovery format + addresses: dnssrvnoa+_memcached-client._tcp.{{ include "loki.resourceName" (dict "ctx" $ "component" "results-cache") }}.{{ include "loki.namespace" $ }}.svc + # -- Specify how long cached results should be stored in the results-cache before being expired + defaultValidity: 12h + # -- Memcached operation timeout + timeout: 500ms + # -- Total number of results-cache replicas + replicas: 1 + # -- Port of the results-cache service + port: 11211 + # -- Amount of memory allocated to results-cache for object storage (in MB). + allocatedMemory: 1024 + # -- Maximum item results-cache for memcached (in MB). 
+ maxItemMemory: 5 + # -- Maximum number of connections allowed + connectionLimit: 16384 + # -- Max memory to use for cache write back + writebackSizeLimit: 500MB + # -- Max number of objects to use for cache write back + writebackBuffer: 500000 + # -- Number of parallel threads for cache write back + writebackParallelism: 1 + # -- Extra init containers for results-cache pods + initContainers: [] + # -- Annotations for the results-cache pods + annotations: {} + # -- Node selector for results-cache pods + nodeSelector: {} + # -- Affinity for results-cache pods + affinity: {} + # -- topologySpreadConstraints allows to customize the default topologySpreadConstraints. This can be either a single dict as shown below or a slice of topologySpreadConstraints. + # labelSelector is taken from the constraint itself (if it exists) or is generated by the chart using the same selectors as for services. + topologySpreadConstraints: [] + # maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: ScheduleAnyway + # -- Tolerations for results-cache pods + tolerations: [] + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: 1 + # -- DNSConfig for results-cache + dnsConfig: {} + # -- The name of the PriorityClass for results-cache pods + priorityClassName: null + # -- Use the host's user namespace in results-cache pods + hostUsers: nil + # -- Labels for results-cache pods + podLabels: {} + # -- Annotations for results-cache pods + podAnnotations: {} + # -- Management policy for results-cache pods + podManagementPolicy: Parallel + # -- Grace period to allow the results-cache to shutdown before it is killed + terminationGracePeriodSeconds: 60 + # -- Stateful results-cache strategy + statefulStrategy: + type: RollingUpdate + # -- Add extended options for results-cache memcached container. The format is the same as for the memcached -o/--extend flag. + # Example: + # extraExtendedOptions: 'tls,modern,track_sizes' + extraExtendedOptions: "" + # -- Additional CLI args for results-cache + extraArgs: {} + # -- Additional containers to be added to the results-cache pod. + extraContainers: [] + # -- Additional volumes to be added to the results-cache pod (applies to both memcached and exporter containers). + # Example: + # extraVolumes: + # - name: extra-volume + # secret: + # secretName: extra-volume-secret + extraVolumes: [] + # -- Additional volume mounts to be added to the results-cache pod (applies to both memcached and exporter containers). + # Example: + # extraVolumeMounts: + # - name: extra-volume + # mountPath: /etc/extra-volume + # readOnly: true + extraVolumeMounts: [] + # -- Resource requests and limits for the results-cache + # By default a safe memory limit will be requested based on allocatedMemory value (floor (* 1.2 allocatedMemory)). + resources: null + # -- Service annotations and labels + service: + annotations: {} + labels: {} + # -- Persistence settings for the results-cache + persistence: + # -- Enable creating PVCs for the results-cache + enabled: false + # -- Size of persistent disk, must be in G or Gi + storageSize: 10G + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). 
+ storageClass: null + # -- Volume mount path + mountPath: /data + # -- PVC additional labels + labels: {} +chunksCache: + # -- Append to the name of the resources to make names different for l1 and l2 + suffix: "" + # -- Specifies whether memcached based chunks-cache should be enabled + enabled: true + # -- Comma separated addresses list in DNS Service Discovery format + addresses: dnssrvnoa+_memcached-client._tcp.{{ include "loki.resourceName" (dict "ctx" $ "component" "chunks-cache" "suffix" $.Values.chunksCache.suffix ) }}.{{ include "loki.namespace" $ }}.svc + # -- Batchsize for sending and receiving chunks from chunks cache + batchSize: 4 + # -- Parallel threads for sending and receiving chunks from chunks cache + parallelism: 5 + # -- Memcached operation timeout + timeout: 2000ms + # -- Specify how long cached chunks should be stored in the chunks-cache before being expired + defaultValidity: 0s + # -- Specify how long cached chunks should be stored in the chunks-cache before being expired + replicas: 1 + # -- Port of the chunks-cache service + port: 11211 + # -- Amount of memory allocated to chunks-cache for object storage (in MB). + allocatedMemory: 8192 + # -- Maximum item memory for chunks-cache (in MB). + maxItemMemory: 5 + # -- Maximum number of connections allowed + connectionLimit: 16384 + # -- Max memory to use for cache write back + writebackSizeLimit: 500MB + # -- Max number of objects to use for cache write back + writebackBuffer: 500000 + # -- Number of parallel threads for cache write back + writebackParallelism: 1 + # -- Extra init containers for chunks-cache pods + initContainers: [] + # -- Annotations for the chunks-cache pods + annotations: {} + # -- Node selector for chunks-cache pods + nodeSelector: {} + # -- Affinity for chunks-cache pods + affinity: {} + # -- topologySpreadConstraints allows to customize the default topologySpreadConstraints. This can be either a single dict as shown below or a slice of topologySpreadConstraints. + # labelSelector is taken from the constraint itself (if it exists) or is generated by the chart using the same selectors as for services. + topologySpreadConstraints: [] + # maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: ScheduleAnyway + # -- Tolerations for chunks-cache pods + tolerations: [] + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: 1 + # -- DNSConfig for chunks-cache + dnsConfig: {} + # -- The name of the PriorityClass for chunks-cache pods + priorityClassName: null + # -- Use the host's user namespace in chunks-cache pods + hostUsers: nil + # -- Labels for chunks-cache pods + podLabels: {} + # -- Annotations for chunks-cache pods + podAnnotations: {} + # -- Management policy for chunks-cache pods + podManagementPolicy: Parallel + # -- Grace period to allow the chunks-cache to shutdown before it is killed + terminationGracePeriodSeconds: 60 + # -- Stateful chunks-cache strategy + statefulStrategy: + type: RollingUpdate + # -- Add extended options for chunks-cache memcached container. The format is the same as for the memcached -o/--extend flag. + # Example: + # extraExtendedOptions: 'tls,no_hashexpand' + extraExtendedOptions: "" + # -- Additional CLI args for chunks-cache + extraArgs: {} + # -- Additional containers to be added to the chunks-cache pod. + extraContainers: [] + # -- Additional volumes to be added to the chunks-cache pod (applies to both memcached and exporter containers). 
+ # Example: + # extraVolumes: + # - name: extra-volume + # secret: + # secretName: extra-volume-secret + extraVolumes: [] + # -- Additional volume mounts to be added to the chunks-cache pod (applies to both memcached and exporter containers). + # Example: + # extraVolumeMounts: + # - name: extra-volume + # mountPath: /etc/extra-volume + # readOnly: true + extraVolumeMounts: [] + # -- Resource requests and limits for the chunks-cache + # By default a safe memory limit will be requested based on allocatedMemory value (floor (* 1.2 allocatedMemory)). + resources: null + # -- Service annotations and labels + service: + annotations: {} + labels: {} + # -- Persistence settings for the chunks-cache + persistence: + # -- Enable creating PVCs for the chunks-cache + enabled: false + # -- Size of persistent disk, must be in G or Gi + storageSize: 10G + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Volume mount path + mountPath: /data + labels: {} + # -- l2 memcache configuration + l2: + # -- Append to the name of the resources to make names different for l1 and l2 + suffix: "l2" + # -- The age of chunks should be transfered from l1 cache to l2 + # 4 days + l2ChunkCacheHandoff: 345600s + # -- Specifies whether memcached based chunks-cache-l2 should be enabled + enabled: false + # -- Comma separated addresses list in DNS Service Discovery format + addresses: 'dnssrvnoa+_memcached-client._tcp.{{ include "loki.resourceName" (dict "ctx" $ "component" "chunks-cache" "suffix" $.Values.chunksCache.l2.suffix ) }}.{{ include "loki.namespace" $ }}.svc' + # -- Batchsize for sending and receiving chunks from chunks cache + batchSize: 4 + # -- Parallel threads for sending and receiving chunks from chunks cache + parallelism: 5 + # -- Memcached operation timeout + timeout: 2000ms + # -- Specify how long cached chunks should be stored in the chunks-cache-l2 before being expired + defaultValidity: 0s + # -- Specify how long cached chunks should be stored in the chunks-cache-l2 before being expired + replicas: 1 + # -- Port of the chunks-cache-l2 service + port: 11211 + # -- Amount of memory allocated to chunks-cache-l2 for object storage (in MB). + allocatedMemory: 8192 + # -- Maximum item memory for chunks-cache-l2 (in MB). + maxItemMemory: 5 + # -- Maximum number of connections allowed + connectionLimit: 16384 + # -- Max memory to use for cache write back + writebackSizeLimit: 500MB + # -- Max number of objects to use for cache write back + writebackBuffer: 500000 + # -- Number of parallel threads for cache write back + writebackParallelism: 1 + # -- Extra init containers for chunks-cache-l2 pods + initContainers: [] + # -- Annotations for the chunks-cache-l2 pods + annotations: {} + # -- Node selector for chunks-cach-l2 pods + nodeSelector: {} + # -- Affinity for chunks-cache-l2 pods + affinity: {} + # -- topologySpreadConstraints allows to customize the default topologySpreadConstraints. This can be either a single dict as shown below or a slice of topologySpreadConstraints. + # labelSelector is taken from the constraint itself (if it exists) or is generated by the chart using the same selectors as for services. 
+ topologySpreadConstraints: [] + # maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: ScheduleAnyway + # -- Tolerations for chunks-cache-l2 pods + tolerations: [] + # -- Pod Disruption Budget maxUnavailable + maxUnavailable: 1 + # -- DNSConfig for chunks-cache-l2 + dnsConfig: {} + # -- The name of the PriorityClass for chunks-cache-l2 pods + priorityClassName: null + # -- Use the host's user namespace in chunks-cache-l2 pods + hostUsers: nil + # -- Labels for chunks-cache-l2 pods + podLabels: {} + # -- Annotations for chunks-cache-l2 pods + podAnnotations: {} + # -- Management policy for chunks-cache-l2 pods + podManagementPolicy: Parallel + # -- Grace period to allow the chunks-cache-l2 to shutdown before it is killed + terminationGracePeriodSeconds: 60 + # -- Stateful chunks-cache strategy + statefulStrategy: + type: RollingUpdate + # -- Add extended options for chunks-cache-l2 memcached container. The format is the same as for the memcached -o/--extend flag. + # Example: + # extraExtendedOptions: 'tls,no_hashexpand' + extraExtendedOptions: "" + # -- Additional CLI args for chunks-cache-l2 + extraArgs: {} + # -- Additional containers to be added to the chunks-cache-l2 pod. + extraContainers: [] + # -- Additional volumes to be added to the chunks-cache-l2 pod (applies to both memcached and exporter containers). + # Example: + # extraVolumes: + # - name: extra-volume + # secret: + # secretName: extra-volume-secret + extraVolumes: [] + # -- Additional volume mounts to be added to the chunks-cache-l2 pod (applies to both memcached and exporter containers). + # Example: + # extraVolumeMounts: + # - name: extra-volume + # mountPath: /etc/extra-volume + # readOnly: true + extraVolumeMounts: [] + # -- Resource requests and limits for the chunks-cache-l2 + # By default a safe memory limit will be requested based on allocatedMemory value (floor (* 1.2 allocatedMemory)). + resources: null + # -- Service annotations and labels + service: + annotations: {} + labels: {} + # -- Persistence settings for the chunks-cache-l2 + persistence: + # -- Enable creating PVCs for the chunks-cache-l2 + enabled: false + # -- Size of persistent disk, must be in G or Gi + storageSize: 10G + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Volume mount path + mountPath: /data + labels: {} +###################################################################################################################### +# +# Subchart configurations +# +###################################################################################################################### +# -- Setting for the Grafana Rollout Operator https://github.com/grafana/helm-charts/tree/main/charts/rollout-operator +rollout_operator: + enabled: false + # -- podSecurityContext is the pod security context for the rollout operator. 
+ # When installing on OpenShift, override podSecurityContext settings with + # + # rollout_operator: + # podSecurityContext: + # fsGroup: null + # runAsGroup: null + # runAsUser: null + podSecurityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault + # Set the container security context + securityContext: + readOnlyRootFilesystem: true + capabilities: + drop: [ALL] + allowPrivilegeEscalation: false +# -- Configuration for the minio subchart +minio: + enabled: false + replicas: 1 + # Minio requires 2 to 16 drives for erasure code (drivesPerNode * replicas) + # https://docs.min.io/docs/minio-erasure-code-quickstart-guide + # Since we only have 1 replica, that means 2 drives must be used. + drivesPerNode: 2 + # root user; not used for GEL authentication + rootUser: root-user + rootPassword: supersecretpassword + # The first user in the list below is used for Loki/GEL authentication. + # You can add additional users if desired; they will not impact Loki/GEL. + # `accessKey` = username, `secretKey` = password + users: + - accessKey: logs-user + secretKey: supersecretpassword + policy: readwrite + buckets: + - name: chunks + policy: none + purge: false + - name: ruler + policy: none + purge: false + - name: admin + policy: none + purge: false + persistence: + size: 5Gi + annotations: {} + resources: + requests: + cpu: 100m + memory: 128Mi + # Allow the address used by Loki to refer to Minio to be overridden + address: null + +# Create extra manifests via values +# Can be a list or dictionary, both are passed through `tpl`. If dict, keys are ignored and only values are used. +# Objects can also be defined as multiline strings, useful for templating field names +extraObjects: null +# - apiVersion: v1 +# kind: ConfigMap +# metadata: +# name: loki-alerting-rules +# data: +# loki-alerting-rules.yaml: |- +# groups: +# - name: example +# rules: +# - alert: example +# expr: | +# sum(count_over_time({app="loki"} |~ "error")) > 0 +# for: 3m +# labels: +# severity: warning +# category: logs +# annotations: +# message: "loki has encountered errors" +# - | +# apiVersion: v1 +# kind: Secret +# type: Opaque +# metadata: +# name: loki-distributed-basic-auth +# data: +# {{- range .Values.loki.tenants }} +# {{ .name }}: {{ b64enc .password | quote }} +# {{- end }} + +sidecar: + image: + # -- The Docker registry and image for the k8s sidecar + repository: docker.io/kiwigrid/k8s-sidecar + # -- Docker image tag + tag: 1.30.10 + # -- Docker image sha. If empty, no sha will be used + sha: "" + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Resource requests and limits for the sidecar + resources: {} + # limits: + # cpu: 100m + # memory: 100Mi + # requests: + # cpu: 50m + # memory: 50Mi + # -- The SecurityContext for the sidecar. + securityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Set to true to skip tls verification for kube api calls. + skipTlsVerify: false + # -- Ensure that rule files aren't conflicting and being overwritten by prefixing their name with the namespace they are defined in. + enableUniqueFilenames: false + # -- Readiness probe definition. Probe is disabled on the sidecar by default. + readinessProbe: {} + # -- Liveness probe definition. Probe is disabled on the sidecar by default. + livenessProbe: {} + # -- Startup probe definition. Probe is disabled on the sidecar by default. 
+ startupProbe: {} + rules: + # -- Whether or not to create a sidecar to ingest rule from specific ConfigMaps and/or Secrets. + enabled: true + # -- Label that the configmaps/secrets with rules will be marked with. + label: loki_rule + # -- Label value that the configmaps/secrets with rules will be set to. + labelValue: "" + # -- Folder into which the rules will be placed. + folder: /rules + # -- The annotation overwriting the folder value. + # The annotation value can be either an absolute or a relative path. Relative paths will be relative to FOLDER. + # Useful for multi-tenancy setups. + folderAnnotation: null + # -- Comma separated list of namespaces. If specified, the sidecar will search for config-maps/secrets inside these namespaces. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify 'ALL' to search in all namespaces. + searchNamespace: null + # -- Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH request, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # -- Search in configmap, secret, or both. + resource: both + # -- Absolute path to the shell script to execute after a configmap or secret has been reloaded. + script: null + # -- WatchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S. + watchServerTimeout: 60 + # + # -- WatchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # Defaults to 66sec. + watchClientTimeout: 60 + # -- Log level of the sidecar container. + logLevel: INFO + +# -- Monitoring section determines which monitoring features to enable +monitoring: + # Dashboards for monitoring Loki + dashboards: + # -- If enabled, create configmap with dashboards for monitoring Loki + enabled: false + # -- Alternative namespace to create dashboards ConfigMap in + namespace: null + # -- Additional annotations for the dashboards ConfigMap + annotations: {} + # -- Labels for the dashboards ConfigMap + labels: + grafana_dashboard: "1" + # -- Recording rules for monitoring Loki, required for some dashboards + rules: + # -- If enabled, create PrometheusRule resource with Loki recording rules + enabled: false + # -- Include alerting rules + alerting: true + # -- Specify which individual alerts should be disabled + # -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead. + # -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render. 
+ # + # -- DEPRECATED: use monitoring.rules.configs.*.enabled instead + disabled: {} + # LokiRequestErrors: true + # LokiRequestPanics: true + + configs: + LokiRequestErrors: + enabled: true + for: 15m + lookbackPeriod: 2m + severity: critical + threshold: 10 + LokiRequestPanics: + enabled: true + lookbackPeriod: 10m + severity: critical + threshold: 0 + LokiRequestLatency: + enabled: true + for: 15m + severity: critical + threshold: 1 + LokiTooManyCompactorsRunning: + enabled: true + for: 5m + severity: warning + LokiCanaryLatency: + enabled: true + for: 15m + lookbackPeriod: 5m + severity: warning + threshold: 5 + + # -- Alternative namespace to create PrometheusRule resources in + namespace: null + # -- Additional annotations for the rules PrometheusRule resource + annotations: {} + # -- Additional labels for the rules PrometheusRule resource + labels: {} + # -- Additional annotations for PrometheusRule alerts + additionalRuleAnnotations: {} + # e.g.: + # additionalRuleAnnotations: + # runbook_url: "https://runbooks.example.com/oncall/loki" + # summary: "What this alert means and how to respond" + # -- Additional labels for PrometheusRule alerts + additionalRuleLabels: {} + # -- Additional groups to add to the rules file + additionalGroups: [] + # - name: additional-loki-rules + # rules: + # - record: job:loki_request_duration_seconds_bucket:sum_rate + # expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job) + # - record: job_route:loki_request_duration_seconds_bucket:sum_rate + # expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route) + # - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + # expr: sum(rate(container_cpu_usage_seconds_total[1m])) by (node, namespace, pod, container) + # -- ServiceMonitor configuration + serviceMonitor: + # -- If enabled, ServiceMonitor resources for Prometheus Operator are created + enabled: false + # -- Namespace selector for ServiceMonitor resources + namespaceSelector: {} + # -- ServiceMonitor annotations + annotations: {} + # -- Additional ServiceMonitor labels + labels: {} + # -- ServiceMonitor scrape interval + # Default is 15s because included recording rules use a 1m rate, and scrape interval needs to be at + # least 1/4 rate interval. + interval: 15s + # -- ServiceMonitor scrape timeout in Go duration format (e.g. 15s) + scrapeTimeout: null + # -- ServiceMonitor relabel configs to apply to samples before scraping + # https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#relabelconfig + relabelings: [] + # -- ServiceMonitor metric relabel configs to apply to samples before ingestion + # https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint + metricRelabelings: [] + # -- ServiceMonitor will use http by default, but you can pick https as well + scheme: http + # -- ServiceMonitor will use these tlsConfig settings to make the health check requests + tlsConfig: null + # -- DEPRECATED If defined, will create a MetricsInstance for the Grafana Agent Operator. + metricsInstance: + # -- If enabled, MetricsInstance resources for Grafana Agent Operator are created + enabled: true + # -- MetricsInstance annotations + annotations: {} + # -- Additional MetricsInstance labels + labels: {} + # -- If defined a MetricsInstance will be created to remote write metrics. + remoteWrite: null + # -- DEPRECATED Self monitoring determines whether Loki should scrape its own logs. 
+ # This feature relies on Grafana Agent Operator, which is deprecated. + # It will create custom resources for GrafanaAgent, LogsInstance, and PodLogs to configure + # scrape configs to scrape its own logs with the labels expected by the included dashboards. + selfMonitoring: + enabled: false + # -- Tenant to use for self monitoring + tenant: + # -- Name of the tenant + name: "self-monitoring" + # -- Password of the gateway for Basic auth + password: null + # -- Namespace to create additional tenant token secret in. Useful if your Grafana instance + # is in a separate namespace. Token will still be created in the canary namespace. + # @default -- The same namespace as the loki chart is installed in. + secretNamespace: '{{ include "loki.namespace" . }}' + # -- DEPRECATED Grafana Agent configuration + grafanaAgent: + # -- DEPRECATED Controls whether to install the Grafana Agent Operator and its CRDs. + # Note that helm will not install CRDs if this flag is enabled during an upgrade. + # In that case install the CRDs manually from https://github.com/grafana/agent/tree/main/production/operator/crds + installOperator: false + # -- Grafana Agent annotations + annotations: {} + # -- Additional Grafana Agent labels + labels: {} + # -- Enable the config read api on port 8080 of the agent + enableConfigReadAPI: false + # -- The name of the PriorityClass for GrafanaAgent pods + priorityClassName: null + # -- Resource requests and limits for the grafanaAgent pods + resources: {} + # limits: + # memory: 200Mi + # requests: + # cpu: 50m + # memory: 100Mi + # -- Tolerations for GrafanaAgent pods + tolerations: [] + # PodLogs configuration + podLogs: + # -- PodLogs version + apiVersion: monitoring.grafana.com/v1alpha1 + # -- PodLogs annotations + annotations: {} + # -- Additional PodLogs labels + labels: {} + # -- PodLogs relabel configs to apply to samples before scraping + # https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#relabelconfig + relabelings: [] + # -- Additional pipeline stages to process logs after scraping + # https://grafana.com/docs/agent/latest/operator/api/#pipelinestagespec-a-namemonitoringgrafanacomv1alpha1pipelinestagespeca + additionalPipelineStages: [] + # LogsInstance configuration + logsInstance: + # -- LogsInstance annotations + annotations: {} + # -- Additional LogsInstance labels + labels: {} + # -- Additional clients for remote write + clients: null + +# -- DEPRECATED Configuration for the table-manager. The table-manager is only necessary when using a deprecated +# index type such as Cassandra, Bigtable, or DynamoDB, it has not been necessary since loki introduced self- +# contained index types like 'boltdb-shipper' and 'tsdb'. This will be removed in a future helm chart. +tableManager: + # -- Specifies whether the table-manager should be enabled + enabled: false + image: + # -- The Docker registry for the table-manager image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the table-manager image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the table-manager image. 
Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for table-manager pods + priorityClassName: null + # -- Labels for table-manager pods + podLabels: {} + # -- Annotations for table-manager deployment + annotations: {} + # -- Annotations for table-manager pods + podAnnotations: {} + service: + # -- Annotations for table-manager Service + annotations: {} + # -- Additional labels for table-manager Service + labels: {} + # -- Additional CLI args for the table-manager + extraArgs: [] + # -- Environment variables to add to the table-manager pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the table-manager pods + extraEnvFrom: [] + # -- Volume mounts to add to the table-manager pods + extraVolumeMounts: [] + # -- Volumes to add to the table-manager pods + extraVolumes: [] + # -- Resource requests and limits for the table-manager + resources: {} + # -- Containers to add to the table-manager pods + extraContainers: [] + # -- Grace period to allow the table-manager to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Use the host's user namespace in table-manager pods + hostUsers: nil + # -- Affinity for table-manager pods. + # @default -- Hard node and anti-affinity + # The value will be passed through tpl. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: table-manager + app.kubernetes.io/name: '{{ include "loki.name" . }}' + app.kubernetes.io/instance: "{{ .Release.Name }}" + topologyKey: kubernetes.io/hostname + # -- DNS config table-manager pods + dnsConfig: {} + # -- Node selector for table-manager pods + nodeSelector: {} + # -- Tolerations for table-manager pods + tolerations: [] + # -- Enable deletes by retention + retention_deletes_enabled: false + # -- Set retention period + retention_period: 0 diff --git a/applications/base/services/observability/loki/helmrelease.yaml b/applications/base/services/observability/loki/helmrelease.yaml new file mode 100644 index 0000000..30c0ae3 --- /dev/null +++ b/applications/base/services/observability/loki/helmrelease.yaml @@ -0,0 +1,36 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: loki + namespace: observability +spec: + interval: 5m + timeout: 10m + driftDetection: + mode: enabled + install: + remediation: + retries: 3 + remediateLastFailure: true + upgrade: + remediation: + retries: 0 + remediateLastFailure: false + targetNamespace: observability + chart: + spec: + chart: loki + version: 6.45.2 + sourceRef: + kind: HelmRepository + name: grafana + namespace: observability + valuesFrom: + - kind: Secret + name: loki-values-base + valuesKey: hardened.yaml + - kind: Secret + name: loki-values-override + valuesKey: override.yaml + optional: true diff --git a/applications/base/services/observability/loki/kustomization.yaml b/applications/base/services/observability/loki/kustomization.yaml new file mode 100644 index 0000000..c612121 --- /dev/null +++ b/applications/base/services/observability/loki/kustomization.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - "source.yaml" + - "helmrelease.yaml" +secretGenerator: + - name: loki-values-base + namespace: observability + type: Opaque + files: + - hardened.yaml=helm-values/hardened-values-v6.45.2.yaml + options: + disableNameSuffixHash: true diff --git 
a/applications/base/services/observability/loki/source.yaml b/applications/base/services/observability/loki/source.yaml new file mode 100644 index 0000000..0808c20 --- /dev/null +++ b/applications/base/services/observability/loki/source.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: grafana + namespace: observability +spec: + url: https://grafana.github.io/helm-charts + interval: 1h diff --git a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml deleted file mode 100644 index 1c11fdd..0000000 --- a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1 copy.yaml +++ /dev/null @@ -1,233 +0,0 @@ -# Security configurations for OpenTelemetry Kube Stack -# Version: 0.11.1 - -# Cluster name for identification -clusterName: "openCenter-cluster" - -# OpenTelemetry Operator configuration -opentelemetry-operator: - enabled: true - manager: - # Security context for operator manager - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - # Resource limits for operator - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "500m" - # Admission webhooks configuration - admissionWebhooks: - failurePolicy: "Ignore" - # Security context for webhooks - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - -# Default collector configuration with security hardening -defaultCRConfig: - enabled: true - mode: deployment - replicas: 2 - - # Security contexts - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - - podSecurityContext: - runAsNonRoot: true - runAsUser: 65534 - fsGroup: 65534 - seccompProfile: - type: RuntimeDefault - - # Container security context - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - - # Resource limits - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - - # Node selector for Linux nodes - nodeSelector: - kubernetes.io/os: linux - - # Basic OTLP configuration - config: - receivers: - otlp: - protocols: - grpc: - endpoint: ${env:MY_POD_IP}:4317 - http: - endpoint: ${env:MY_POD_IP}:4318 - processors: - batch: - timeout: 1s - send_batch_size: 1024 - memory_limiter: - limit_mib: 400 - spike_limit_mib: 100 - check_interval: 5s - exporters: - logging: - loglevel: info - service: - pipelines: - traces: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - metrics: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - logs: - receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [logging] - -# Kube State Metrics configuration -kubeStateMetrics: - enabled: true - -kube-state-metrics: - # Security context - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - 
readOnlyRootFilesystem: true - # Resource limits - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "200m" - # Node selector - nodeSelector: - kubernetes.io/os: linux - # Prometheus monitoring - prometheus: - monitor: - enabled: true - honorLabels: true - -# Node Exporter configuration -nodeExporter: - enabled: true - -prometheus-node-exporter: - # Security context - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - # Resource limits - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "200m" - # Node selector - nodeSelector: - kubernetes.io/os: linux - # Prometheus monitoring - prometheus: - monitor: - enabled: true - jobLabel: node-exporter - -# Kubernetes service monitors (disabled to avoid conflicts with existing monitoring) -kubernetesServiceMonitors: - enabled: false - -# Individual component monitors (disabled to avoid conflicts) -kubeApiServer: - enabled: false -kubelet: - enabled: false -kubeControllerManager: - enabled: false -coreDns: - enabled: false -kubeEtcd: - enabled: false -kubeScheduler: - enabled: false -kubeProxy: - enabled: false - -# CRDs installation -crds: - installOtel: true - installPrometheus: false # Disabled to avoid conflicts with existing Prometheus stack - -# Cleanup job configuration -cleanupJob: - enabled: true - image: - repository: rancher/kubectl - tag: v1.34.1 - # Security context for cleanup job - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - containerSecurityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true \ No newline at end of file From ed9c8b4738c7e596f833a2fc6fe23f9ee0b1f20f Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Wed, 5 Nov 2025 13:29:54 +0000 Subject: [PATCH 07/10] fix: update default values.yaml of otel --- .../helm-values/hardened-values-v0.11.1.yaml | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml index cfd2b81..5a3ca7a 100644 --- a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml +++ b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml @@ -492,15 +492,15 @@ collectors: logsCollection: enabled: true kubeletMetrics: - enabled: true + enabled: false hostMetrics: - enabled: true + enabled: false kubernetesAttributes: enabled: true kubernetesEvents: enabled: true clusterMetrics: - enabled: true + enabled: false config: receivers: otlp: @@ -524,7 +524,21 @@ collectors: timeout: 1s send_batch_max_size: 1500 exporters: - debug: {} + otlphttp/loki: + endpoint: http://observability-loki-gateway.observability.svc.cluster.local/otlp + headers: + X-Scope-OrgID: "default" + compression: gzip + timeout: 30s + retry_on_failure: + enabled: true + initial_interval: 1s + max_interval: 10s + max_elapsed_time: 0s + sending_queue: + enabled: true + num_consumers: 10 + queue_size: 2000 service: pipelines: @@ -554,7 +568,7 @@ collectors: - resource/hostname - batch exporters: - - debug + - otlphttp/loki # Cluster role configuration 
clusterRole: From abc259e38ad37caf7787b6b58876714fddd80d12 Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Wed, 5 Nov 2025 13:47:34 +0000 Subject: [PATCH 08/10] docs: update readme.md of otel --- .../opentelemetry-kube-stack/README.md | 215 ++---------------- 1 file changed, 15 insertions(+), 200 deletions(-) diff --git a/applications/base/services/observability/opentelemetry-kube-stack/README.md b/applications/base/services/observability/opentelemetry-kube-stack/README.md index b4f636c..6176ca9 100644 --- a/applications/base/services/observability/opentelemetry-kube-stack/README.md +++ b/applications/base/services/observability/opentelemetry-kube-stack/README.md @@ -1,204 +1,19 @@ -# OpenTelemetry Kube Stack +# OpenTelemetry Kube Stack – Base Configuration -The OpenTelemetry Kube Stack is a comprehensive observability solution that provides a complete OpenTelemetry setup for Kubernetes clusters. It includes the OpenTelemetry Operator, collectors, and essential monitoring components. +This directory contains the **base manifests** for deploying the [OpenTelemetry Kube Stack](https://opentelemetry.io/), a **unified observability framework** for collecting, processing, and exporting **traces and logs** from Kubernetes workloads and infrastructure components. +It is designed to be **consumed by cluster repositories** as a remote base, allowing each cluster to apply **custom overrides** as needed. -## Overview +--- -This chart deploys: -- **OpenTelemetry Operator**: Manages OpenTelemetry collectors and instrumentation -- **OpenTelemetry Collector**: Collects, processes, and exports telemetry data -- **Kube State Metrics**: Exposes cluster-level metrics about Kubernetes objects -- **Node Exporter**: Collects hardware and OS metrics from cluster nodes +## About OpenTelemetry Kube Stack -## Configuration - -### Chart Information -- **Chart**: opentelemetry-kube-stack -- **Version**: 0.11.1 -- **App Version**: 0.129.1 -- **Repository**: https://open-telemetry.github.io/opentelemetry-helm-charts - -### Namespace -Deployed in the `observability` namespace alongside other monitoring components. 
- -### Security Hardening - -The deployment includes comprehensive security configurations: - -#### Container Security -- Non-root execution (`runAsNonRoot: true`) -- Specific user ID (`runAsUser: 65534`) -- Security profiles (`seccompProfile.type: RuntimeDefault`) -- Capability dropping (`capabilities.drop: [ALL]`) -- Read-only root filesystem (`readOnlyRootFilesystem: true`) -- Privilege escalation disabled (`allowPrivilegeEscalation: false`) - -#### Resource Management -- CPU and memory limits defined for all components -- Resource requests set for proper scheduling -- Memory limiter processor configured for collectors - -#### Network Security -- OTLP receivers configured on standard ports (4317/4318) -- Service monitors enabled for Prometheus integration -- Node selectors for Linux-only deployment - -### Key Features - -#### OpenTelemetry Operator -- Manages collector lifecycle and configuration -- Supports auto-instrumentation for applications -- Webhook-based configuration validation - -#### Collector Configuration -- OTLP receivers for traces, metrics, and logs -- Batch processing for efficient data handling -- Memory limiting to prevent resource exhaustion -- Logging exporter for initial setup (can be customized) - -#### Monitoring Integration -- Prometheus ServiceMonitor resources enabled -- Kube State Metrics for cluster-level observability -- Node Exporter for infrastructure metrics -- Compatible with existing Prometheus stack - -### Customization - -#### Collector Configuration -The default collector configuration can be extended by modifying the `config` section in the hardened values file. Common customizations include: - -```yaml -config: - exporters: - otlp: - endpoint: "your-backend:4317" - tls: - insecure: false - prometheusremotewrite: - endpoint: "https://prometheus.example.com/api/v1/write" -``` - -#### Resource Scaling -Adjust resource limits based on cluster size and telemetry volume: - -```yaml -resources: - requests: - memory: "256Mi" - cpu: "200m" - limits: - memory: "1Gi" - cpu: "1000m" -``` - -### Dependencies - -This chart has dependencies on: -- OpenTelemetry CRDs (installed automatically) -- Kubernetes 1.19+ for proper ServiceMonitor support -- Prometheus Operator (for ServiceMonitor resources) - -### Compatibility - -#### Existing Services -The configuration is designed to work alongside existing observability services: -- **kube-prometheus-stack**: Kubernetes service monitors disabled to avoid conflicts -- **Prometheus CRDs**: Installation disabled (uses existing CRDs) -- **Grafana**: Compatible with OpenTelemetry data sources - -#### OpenTelemetry Operator -This deployment may conflict with the existing `opentelemetry-operator` service. 
Consider: -- Using this as a replacement for the standalone operator -- Disabling the operator component if only collectors are needed -- Coordinating CRD management between deployments - -### Monitoring and Observability - -#### Health Checks -Monitor the deployment status: -```bash -kubectl get helmrelease opentelemetry-kube-stack -n observability -kubectl get pods -n observability -l app.kubernetes.io/name=opentelemetry-kube-stack -``` - -#### Collector Status -Check OpenTelemetry collector status: -```bash -kubectl get opentelemetrycollector -n observability -kubectl logs -n observability -l app.kubernetes.io/component=opentelemetry-collector -``` - -#### Metrics Availability -Verify metrics collection: -```bash -kubectl port-forward -n observability svc/opentelemetry-kube-stack-collector 8888:8888 -curl http://localhost:8888/metrics -``` - -### Troubleshooting - -#### Common Issues - -1. **CRD Conflicts**: If OpenTelemetry CRDs already exist, disable installation: - ```yaml - crds: - installOtel: false - ``` - -2. **Resource Constraints**: Increase resource limits if collectors are OOMKilled: - ```yaml - resources: - limits: - memory: "1Gi" - ``` - -3. **Webhook Failures**: If admission webhooks cause issues: - ```yaml - opentelemetry-operator: - admissionWebhooks: - failurePolicy: "Ignore" - ``` - -#### Debug Commands -```bash -# Check operator logs -kubectl logs -n observability -l app.kubernetes.io/name=opentelemetry-operator - -# Describe collector resources -kubectl describe opentelemetrycollector -n observability - -# Check service monitor status -kubectl get servicemonitor -n observability -``` - -### Integration Examples - -#### Application Instrumentation -Enable auto-instrumentation for applications: -```yaml -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: my-instrumentation -spec: - exporter: - endpoint: http://opentelemetry-kube-stack-collector:4317 - propagators: - - tracecontext - - baggage -``` - -#### Custom Exporters -Configure exporters for your observability backend: -```yaml -config: - exporters: - jaeger: - endpoint: jaeger-collector:14250 - tls: - insecure: true - prometheus: - endpoint: "0.0.0.0:8889" -``` - -This deployment provides a solid foundation for OpenTelemetry-based observability in Kubernetes environments with enterprise-grade security and monitoring capabilities. \ No newline at end of file +- Provides a **complete observability foundation** for Kubernetes clusters, integrating **traces and logs** under a single open standard. +- Deployed using the **OpenTelemetry Operator**, which manages collectors, instrumentation, and telemetry pipelines declaratively via Kubernetes manifests. +- Collects telemetry data from: + - **Kubernetes system components** (API server, kubelet, scheduler, etc.) + - **Application workloads** instrumented with OpenTelemetry SDKs or auto-instrumentation. +- Processes data through **OpenTelemetry Collectors**, which perform transformation, filtering, batching, and enrichment before export. +- Supports multiple backends including **Prometheus**, **Tempo**, **Loki**, **Grafana**, **Jaeger**, and **OTLP-compatible endpoints**. +- Enables **auto-discovery and dynamic configuration** for Kubernetes workloads, simplifying instrumentation and reducing manual setup. +- Designed for **scalability and resilience**, supporting both **agent** and **gateway** modes for distributed telemetry collection. 
+- Natively integrates with **Grafana** and other observability tools for unified dashboards and correlation between metrics, traces, and logs. From 33ecda162593a2132e140a4eacd97b5aba434fc9 Mon Sep 17 00:00:00 2001 From: Pratik Bandarkar Date: Wed, 5 Nov 2025 13:57:31 +0000 Subject: [PATCH 09/10] fix: Update presets and remove metrics pipeline --- .../helm-values/hardened-values-v0.11.1.yaml | 16 +++------------- .../opentelemetry-kube-stack/helmrelease.yaml | 2 +- .../opentelemetry-kube-stack/source.yaml | 2 +- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml index 5a3ca7a..717b373 100644 --- a/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml +++ b/applications/base/services/observability/opentelemetry-kube-stack/helm-values/hardened-values-v0.11.1.yaml @@ -492,15 +492,15 @@ collectors: logsCollection: enabled: true kubeletMetrics: - enabled: false + enabled: true hostMetrics: - enabled: false + enabled: true kubernetesAttributes: enabled: true kubernetesEvents: enabled: true clusterMetrics: - enabled: false + enabled: true config: receivers: otlp: @@ -551,15 +551,6 @@ collectors: - batch exporters: - debug - metrics: - receivers: - - otlp - processors: - - resourcedetection/env - - resource/hostname - - batch - exporters: - - debug logs: receivers: - otlp @@ -1764,4 +1755,3 @@ extraObjects: [] # objectName: imagePullSecret # secretName: demo-image-pull-secrets # type: kubernetes.io/dockerconfigjson - diff --git a/applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml b/applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml index 722e55e..54050d2 100644 --- a/applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml +++ b/applications/base/services/observability/opentelemetry-kube-stack/helmrelease.yaml @@ -34,4 +34,4 @@ spec: - kind: Secret name: opentelemetry-kube-stack-values-override valuesKey: override.yaml - optional: true \ No newline at end of file + optional: true diff --git a/applications/base/services/observability/opentelemetry-kube-stack/source.yaml b/applications/base/services/observability/opentelemetry-kube-stack/source.yaml index 9393c6b..4c3e04b 100644 --- a/applications/base/services/observability/opentelemetry-kube-stack/source.yaml +++ b/applications/base/services/observability/opentelemetry-kube-stack/source.yaml @@ -5,4 +5,4 @@ metadata: name: opentelemetry spec: url: https://open-telemetry.github.io/opentelemetry-helm-charts - interval: 1h \ No newline at end of file + interval: 1h From 9fb35400bf08ed8476d34b71aa68d59109f209a4 Mon Sep 17 00:00:00 2001 From: Miguel Parada Date: Tue, 4 Nov 2025 13:04:29 -0600 Subject: [PATCH 10/10] fix: use null_resource vs local_file for id_rsa (#40) --- .../openstack/lib/openstack-keypair/main.tf | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/iac/cloud/openstack/lib/openstack-keypair/main.tf b/iac/cloud/openstack/lib/openstack-keypair/main.tf index 7ecb6d0..1a0f3a9 100644 --- a/iac/cloud/openstack/lib/openstack-keypair/main.tf +++ b/iac/cloud/openstack/lib/openstack-keypair/main.tf @@ -2,14 +2,17 @@ resource "openstack_compute_keypair_v2" "ssh_keypair" { name = replace(format("%skey", var.naming_prefix), ".", "-") } -resource 
"local_file" "private_key" { - content = openstack_compute_keypair_v2.ssh_keypair.private_key - filename = "${path.root}/id_rsa" - file_permission = "0600" -} +resource "null_resource" "save_ssh_keys" { + triggers = { + key_id = openstack_compute_keypair_v2.ssh_keypair.id + } -resource "local_file" "public_key" { - content = openstack_compute_keypair_v2.ssh_keypair.public_key - filename = "${path.root}/id_rsa.pub" - file_permission = "0644" -} + provisioner "local-exec" { + command = <<-EOT + echo '${openstack_compute_keypair_v2.ssh_keypair.private_key}' > ${path.root}/id_rsa + chmod 600 ${path.root}/id_rsa + echo '${openstack_compute_keypair_v2.ssh_keypair.public_key}' > ${path.root}/id_rsa.pub + chmod 644 ${path.root}/id_rsa.pub + EOT + } +} \ No newline at end of file