---
# This values file provides the default values for the chart. Placeholders like
# <CLUSTER_NAME> will be provided via the values.yaml provided during the cluster
# onboarding process -- or you can provide your own with the appropriate values
# given to you during initial onboarding for an initial cluster and region.

# Override the chart name used in generated resource names.
nameOverride: ""
# Override the full release-qualified name used in generated resource names.
fullnameOverride: ""
# Array of image pull secret references, e.g. [{name: my-registry-secret}]
imagePullSecrets: []
otel:
  # SWO OTEL ingestion endpoint, e.g. otel.collector.na-01.cloud.solarwinds.com:443
  # A list of all available OTEL endpoints: https://documentation.solarwinds.com/en/success_center/observability/content/system_requirements/endpoints.htm
  endpoint: <OTEL_ENVOY_ADDRESS>
  # Set to true to skip TLS certificate verification when connecting to the endpoint.
  tls_insecure: false
  # SWO API token used for authentication. If filled it will create a secret that will be used by the collector.
  api_token: ""
  # The OTEL collector supports an HTTPS proxy. Specify the full URL of the HTTPS
  # proxy here, e.g. https_proxy_url: "https://myproxy.mydomain.com:8080"
  https_proxy_url: ""
  # Image used by the collector pods on Linux nodes.
  image:
    repository: solarwinds/swi-opentelemetry-collector
    # If not set, the appVersion field from Chart.yaml is used.
    tag: ""
    pullPolicy: IfNotPresent
  # Support for Windows nodes.
  windows:
    enabled: true
    # Image used by the collector pods on Windows nodes.
    image:
      repository: solarwinds/swi-opentelemetry-collector
      # If not set, the appVersion field from Chart.yaml is used.
      tag: ""
      pullPolicy: IfNotPresent
  # Images used by init containers.
  init_images:
    # Used by the init container that verifies the SWO OTEL endpoint is reachable (gRPC).
    swi_endpoint_check:
      repository: "fullstorydev/grpcurl"
      tag: "v1.8.9"
      pullPolicy: IfNotPresent
    busy_box:
      repository: "busybox"
      tag: "1.36.1"
      pullPolicy: IfNotPresent
  # Configuration of the per-node collector (DaemonSet).
  node_collector:
    terminationGracePeriodSeconds: 600
    sending_queue:
      enabled: true
      # Number of consumers that dequeue batches; ignored if enabled is false.
      num_consumers: 20
      # Maximum number of batches kept in memory before dropping; ignored if enabled is false.
      # User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per second
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000
      # Configuration for persistent data storage of the sending queue of the node collector.
      # For detailed explanation see https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md#persistent-queue
      persistent_storage:
        enabled: false
        directory: /var/lib/swo/sending_queue
    retry_on_failure:
      enabled: true
      # Time to wait after the first failure before retrying; ignored if enabled is false.
      initial_interval: 10s
      # Upper bound on backoff; ignored if enabled is false.
      max_interval: 30s
      # Maximum amount of time spent trying to send a batch; ignored if enabled is false.
      max_elapsed_time: 300s
    # Time to wait per individual attempt to send data to SWO.
    timeout: 15s
  # Configuration for metrics collection.
  metrics:
    # Define whether metrics will be collected and sent.
    enabled: true
    # Configuration for metric discovery.
    autodiscovery:
      prometheusEndpoints:
        # Define whether metrics will be discovered and scraped by prometheus annotations.
        enabled: true
        # Additional custom rule for discovery (following rule is always present: type == "pod" && annotations["prometheus.io/scrape"] == "true")
        # Available fields:
        # `id` - ID of source endpoint
        # `name` - name of the pod
        # `namespace` - namespace of the pod
        # `uid` - unique id of the pod
        # `labels` - map of labels set on the pod
        # `annotations` - map of annotations set on the pod
        # Example: namespace == "test-namespace" && labels["app"] == "test-app"
        additionalRules: ""
        podMonitors:
          # Additional custom rules for discovery.
          # Available fields for podMonitors rules are the same as for additionalRules.
          rules: []
          # - rule: labels["example"] == "value"
          #   metrics_path: "/metrics"
          #   endpoint_port: 8080
        customTransformations:
          # List of metrics that are counters and should be converted to rate.
          counterToRate: []
        # This filter is applied after metric processing; it is the place where metrics can be filtered out.
        # See https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference.
        filter: {}
    # Check if the SWO OTEL endpoint is reachable.
    swi_endpoint_check: true
    # Check if the Prometheus endpoint (provided by otel.metrics.prometheus.url) is reachable.
    prometheus_check: false
    sending_queue:
      enabled: true
      # Number of consumers that dequeue batches; ignored if enabled is false.
      num_consumers: 20
      # Maximum number of batches kept in memory before dropping; ignored if enabled is false.
      # User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per second
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000
      # When enabled, the sending_queue of the metrics collector will be offloaded to disk (using an emptyDir volume).
      # This will reduce the amount of memory, but it will use slightly more CPU and will have slightly lower throughput.
      offload_to_disk: false
    retry_on_failure:
      enabled: true
      # Time to wait after the first failure before retrying; ignored if enabled is false.
      initial_interval: 10s
      # Upper bound on backoff; ignored if enabled is false.
      max_interval: 30s
      # Maximum amount of time spent trying to send a batch; ignored if enabled is false.
      max_elapsed_time: 300s
    prometheus:
      # URL of prometheus where to scrape.
      url: ""
      # Prometheus URL scheme. It can take the values `http` or `https`.
      scheme: http
      # How often the metrics are scraped from Prometheus.
      scrape_interval: 60s
    # Time to wait per individual attempt to send data to SWO.
    timeout: 15s
    # Configuration of the endpoint on which the metrics collector receives OpenTelemetry metrics.
    otlp_endpoint:
      port: 4317
    kube-state-metrics:
      # URL of kube-state-metrics where to scrape.
      url: ""
      # kube-state-metrics URL scheme. It can take the values `http` or `https`.
      scheme: http
      # How often the metrics are scraped from kube-state-metrics.
      scrape_interval: 60s
    # This filter is applied after metric processing; it is the place where metrics can be filtered out.
    # See https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference.
    filter:
      exclude:
        match_type: regexp
        metric_names:
          - .*_temp
          - apiserver_request_total
    # Use this configuration to scrape extra metrics from Prometheus. Multiple metrics can be specified.
    # See format in https://prometheus.io/docs/prometheus/latest/querying/basics/#instant-vector-selectors
    extra_scrape_metrics: []
    # In case `otel.metrics.autodiscovery.prometheusEndpoints.enabled` is set to `true` (which is by default) there is a possibility
    # that those extra prometheus metrics are scraped by the collector, so in this case `extra_scrape_metrics` is ignored. By setting
    # `force_extra_scrape_metrics` to `true` you can force the collector to scrape those metrics.
    force_extra_scrape_metrics: false
    # Batching configuration for metrics.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference.
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s
    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference.
    memory_limiter:
      check_interval: 1s
      limit_mib: 2560
      spike_limit_mib: 512
    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference.
    memory_ballast:
      size_mib: 700
    # Resource configuration for the singleton (metrics) collector.
    resources:
      requests:
        memory: 3Gi
      limits:
        # Override if your singleton collector is being OOM-killed.
        memory: 3Gi
    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true
        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""
      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: true
        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""
    # Telemetry information of the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference.
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false
          # Additional labels
          additionalLabels: {}
          # key: value
          # Override namespace (default is the same as K8s collector)
          namespace:
          # Interval to scrape metrics
          interval: 60s
          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s
    # Scheduling configurations
    # By default: set to run on linux amd64 nodes
    nodeSelector: {}
    tolerations: []
    affinity: {}
    terminationGracePeriodSeconds: 600
    readinessProbe:
      initialDelaySeconds: 10
    livenessProbe:
      initialDelaySeconds: 10
  # Configuration for Events collection.
  events:
    # Define whether events will be collected and sent.
    enabled: true
    # This filter is applied after events processing; it is the place where events can be filtered out.
    # See https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference.
    # filter:
    # Batching configuration for events.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference.
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s
    sending_queue:
      enabled: true
      # Number of consumers that dequeue batches; ignored if enabled is false.
      num_consumers: 10
      # Maximum number of batches kept in memory before dropping; ignored if enabled is false.
      # User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per second
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000
      # When enabled, the sending_queue of the events collector will be offloaded to disk (using an emptyDir volume).
      # This will reduce the amount of memory, but it will use slightly more CPU and will have slightly lower throughput.
      offload_to_disk: false
    retry_on_failure:
      enabled: true
      # Time to wait after the first failure before retrying; ignored if enabled is false.
      initial_interval: 10s
      # Upper bound on backoff; ignored if enabled is false.
      max_interval: 30s
      # Maximum amount of time spent trying to send a batch; ignored if enabled is false.
      max_elapsed_time: 300s
    # Time to wait per individual attempt to send data to SWO.
    timeout: 15s
    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference.
    memory_limiter:
      check_interval: 1s
      limit_mib: 512
      spike_limit_mib: 128
    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference.
    memory_ballast:
      size_mib: 300
    # Resource configuration for the events collector.
    resources:
      requests:
        memory: 1000Mi
      limits:
        memory: 1000Mi
    # Telemetry information of the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference.
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false
          # Additional labels
          additionalLabels: {}
          # key: value
          # Override namespace (default is the same as K8s collector)
          namespace:
          # Interval to scrape metrics
          interval: 60s
          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s
    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true
        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""
      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: false
        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""
    # Scheduling configurations
    # By default: set to run on linux amd64 nodes
    nodeSelector: {}
    tolerations: []
    affinity: {}
    terminationGracePeriodSeconds: 600
  # Configuration for Logs collection.
  logs:
    # Define whether logs will be collected and sent.
    enabled: true
    # If true, the journal logs on nodes will be collected.
    # Each log has the following attributes so they can be filtered out using the filter configuration:
    # * sw.k8s.log.type=journal
    # * k8s.cluster.name - name of the cluster (input generated during onboarding)
    # * sw.k8s.cluster.uid - UUID of the cluster (input generated during onboarding)
    # * sw.k8s.agent.manifest.version - version of the manifest
    # * k8s.node.name - node from which the journal logs are coming from
    journal: true
    # If true, the container logs will be collected.
    # Log collection uses the `filelog` OTEL receiver under the hood:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver
    # Each log has the following attributes so they can be filtered out using the filter configuration:
    # * sw.k8s.log.type=container
    # * k8s.cluster.name - name of the cluster (input generated during onboarding)
    # * sw.k8s.cluster.uid - UUID of the cluster (input generated during onboarding)
    # * sw.k8s.agent.manifest.version - version of the manifest
    # * k8s.node.name - node from which the container logs are coming from
    # * k8s.container.name - name of the container that is reporting logs
    # * k8s.namespace.name - namespace of the container
    # * k8s.pod.name - pod of the container
    # * run_id - id of the container run
    # * k8s.pod.uid - pod's uid
    container: true
    # This filter is applied after initial log processing; it is the place where logs can be filtered out.
    # See https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference.
    filter:
      include:
        match_type: regexp
        # A log has to match all expressions in the list to be included.
        # See https://github.com/google/re2/wiki/Syntax for regexp syntax.
        record_attributes:
          # Allow only system namespaces (kube-system, kube-public).
          - key: k8s.namespace.name
            value: ^kube-.*$
    # Batching configuration for logs.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference.
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s
    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true
        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        # By default all labels are excluded.
        excludePattern: "k8s\\.\\w+\\.labels\\..*"
      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: true
        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        # By default all annotations are excluded.
        excludePattern: "k8s\\.\\w+\\.annotations\\..*"
    # Telemetry information of the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference.
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false
          # Additional labels
          additionalLabels: {}
          # key: value
          # Override namespace (default is the same as K8s collector)
          namespace:
          # Interval to scrape metrics
          interval: 60s
          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s
    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference.
    memory_limiter:
      check_interval: 1s
      limit_mib: 800
      spike_limit_mib: 300
    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference.
    # memory_ballast:
    #   size_mib: 200
    # Resource configuration for the log collector.
    resources:
      requests:
        memory: 50Mi
      limits:
        memory: 1Gi
    # Scheduling configurations
    nodeSelector: {}
    # By default: tolerations allow the DaemonSet to be deployed on tainted nodes so that we can also collect logs from those nodes.
    tolerations: []
    # By default: affinity is set to run the DaemonSet on linux amd64.
    affinity: {}
    # Properties that can be configured on the filelog receiver. For a full description of the properties
    # see https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver
    receiver:
      start_at: end
      poll_interval: 200ms
      max_concurrent_files: 10
      encoding: utf-8
      fingerprint_size: 1kb
      max_log_size: 1MiB
    # Properties that can be configured on the file storage extension that is used to persist log checkpoints.
    # See https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/storage/filestorage
    filestorage:
      directory: /var/lib/swo/checkpoints
      timeout: 5s
# Identification of the monitored cluster (values generated during onboarding).
cluster:
  name: <CLUSTER_NAME>
  uid: <CLUSTER_UID>
# If enabled, it creates a CronJob that will periodically check for new versions of the Helm chart and upgrade if available.
# Keep in mind that in order to update resources the job has full access to the namespace where it is deployed and also has access to modify ClusterRole and ClusterRoleBinding.
autoupdate:
  enabled: false
  # How often the update will be checked. See https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#writing-a-cronjob-spec
  schedule: "@daily"
  # Whether it should check for pre-release versions.
  devel: false
  image:
    repository: "alpine/k8s"
    tag: "1.27.8"
    pullPolicy: IfNotPresent
# Set labels on every deployed resource.
# commonLabels:
# Configuration passed to the bundled kube-state-metrics subchart.
kube-state-metrics:
  enabled: true
  service:
    annotations:
      opencost.scrape: "true"
  # Disable the default prometheus.io/scrape annotation added by the subchart.
  prometheusScrape: false
  nodeSelector:
    kubernetes.io/os: linux
# Node exporter is deployed only in case the opencost section is enabled. Otherwise this section can be ignored.
prometheus-node-exporter:
  service:
    port: 9101
    targetPort: 9101
    annotations:
      opencost.scrape: "true"
      prometheus.io/scrape: "false"
  rbac:
    pspEnabled: false
  nodeSelector:
    kubernetes.io/os: linux
  # Keep node-exporter off EKS Fargate nodes, where DaemonSets are not supported.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: eks.amazonaws.com/compute-type
                operator: NotIn
                values:
                  - fargate
swoagent:
  # Whether the SWO Agent should be deployed as part of this chart.
  # If not, integrations are not available.
  enabled: false
  image:
    repository: solarwinds/swo-agent
    tag: "v2.6.28"
    pullPolicy: IfNotPresent
  resources:
    limits:
      memory: 800Mi
    requests:
      memory: 800Mi
      cpu: 100m
  nodeSelector: {}
  affinity: {}
aws_fargate:
  # Enable support for the AWS EKS Fargate environment.
  enabled: false
  # Configuration for Logs collection.
  logs:
    # Enable deployment of AWS FluentBit to the Fargate cluster.
    enabled: false
    # AWS region where the Fargate cluster is running.
    region:
    # Include additional FluentBit filters.
    # See https://docs.fluentbit.io/manual/pipeline/filters
    # NOTE: The FluentBit configuration expects four spaces as indentation within sections.
    filters: ""
# eBPF-based network monitoring (OpenTelemetry eBPF components).
ebpfNetworkMonitoring:
  enabled: false
  kernelCollector:
    enabled: true
    telemetry:
      logs:
        level: "warning"
    image:
      repository: "otel/opentelemetry-ebpf-kernel-collector"
      tag: "v0.10.1"
      pullPolicy: IfNotPresent
    resources:
      requests:
        memory: 50Mi
    # Scheduling configurations
    nodeSelector: {}
    # By default: tolerations allow the DaemonSet to be deployed on tainted nodes so that we can also collect data from those nodes.
    tolerations: []
    # By default: affinity is set to run the DaemonSet on linux amd64.
    affinity: {}
  k8sCollector:
    enabled: true
    telemetry:
      logs:
        level: "warning"
    watcher:
      image:
        repository: "otel/opentelemetry-ebpf-k8s-watcher"
        tag: "v0.10.1"
        pullPolicy: IfNotPresent
    relay:
      image:
        repository: "otel/opentelemetry-ebpf-k8s-relay"
        tag: "v0.10.1"
        pullPolicy: IfNotPresent
  reducer:
    disableMetrics: []
    enableMetrics: []
    # Port on which the reducer will listen for metrics from kernelCollector and k8sCollector.
    telemetryPort: 7000
    # Enables id-id time-series generation. The id-id time-series carry the lowest-level information but are of the greatest volume and cardinality, so are disabled by default.
    # This also adds IP address to ingested metrics which is useful for identification which Pod from the workload is causing the traffic.
    enableIdIdGeneration: false
    # At present, scaling the reducer is a manual try-and-see task. The reducer runs a data processing pipeline separated into three stages - ingest, matching and aggregation.
    # Usually, the best approach is to scale all the stages by the same factor. Keep in mind that each shard consumes a certain amount of memory, whether it is heavily loaded or not.
    # Read more info about architecture https://github.com/open-telemetry/opentelemetry-ebpf/blob/main/docs/reducer/architecture.md
    numIngestShards: 3
    numMatchingShards: 3
    numAggregationShards: 3
    telemetry:
      logs:
        level: "warning"
      metrics:
        enabled: false
    image:
      repository: "otel/opentelemetry-ebpf-reducer"
      tag: "v0.10.1"
      pullPolicy: IfNotPresent
# Prometheus is deployed only in case opencost section is enabled. Otherwise this section can be ignored.
prometheus:
alertmanager:
enabled: false
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
enabled: false
kube-state-metrics:
enabled: false
serverFiles:
prometheus.yml:
rule_files:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml
## Below two files are DEPRECATED will be removed from this default values file
- /etc/config/rules
- /etc/config/alerts
scrape_configs:
- job_name: 'kubernetes-nodes-cadvisor'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
# This configuration will work only on kubelet 1.7.3+
# As the scrape endpoints for cAdvisor have changed
# if you are using older version you need to change the replacement to
# replacement: /api/v1/nodes/$1:4194/proxy/metrics
# more info here https://github.com/coreos/prometheus-operator/issues/633
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: (container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_errors_total|container_network_transmit_errors_total|container_network_receive_packets_dropped_total|container_network_transmit_packets_dropped_total|container_memory_usage_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_periods_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_spec_cpu_shares|container_spec_memory_limit_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_reads_bytes_total|container_network_receive_bytes_total|container_fs_writes_bytes_total|container_fs_reads_bytes_total|cadvisor_version_info|kubecost_pv_info)
action: keep
- source_labels: [ container ]
target_label: container_name
regex: (.+)
action: replace
- source_labels: [ pod ]
target_label: pod_name
regex: (.+)
action: replace
- job_name: 'kubernetes-nodes'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: (kubelet_volume_stats_used_bytes) # this metric is in alpha
action: keep
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_opencost_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: kubernetes_node
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: (container_cpu_allocation|container_cpu_usage_seconds_total|container_fs_limit_bytes|container_fs_writes_bytes_total|container_gpu_allocation|container_memory_allocation_bytes|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|DCGM_FI_DEV_GPU_UTIL|deployment_match_labels|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_available|kube_job_status_failed|kube_namespace_annotations|kube_namespace_labels|kube_node_info|kube_node_labels|kube_node_status_allocatable|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_node_status_capacity|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_node_status_condition|kube_persistentvolume_capacity_bytes|kube_persistentvolume_status_phase|kube_persistentvolumeclaim_info|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_limits_cpu_cores|kube_pod_container_resource_limits_memory_bytes|kube_pod_container_resource_requests|kube_pod_container_resource_requests_cpu_cores|kube_pod_container_resource_requests_memory_bytes|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_terminated_reason|kube_pod_labels|kube_pod_owner|kube_pod_status_phase|kube_replicaset_owner|kube_statefulset_replicas|kube_statefulset_status_replicas|kubecost_cluster_info|kubecost_cluster_management_cost|kubecost_cluster_memory_working_set_bytes|kubecost_load_balancer_cost|kubecost_network_internet_egress_cost|kubecost_network_region_egress_cost|kubecost_network_zone_egress_cost|kubecost_node_is_spot|kubecost_pod_network_egress_bytes_total|node_cpu_hourly_cost|node_cpu_seconds_total|node_disk_reads_completed|node_disk_reads_completed_total|node
_disk_writes_completed|node_disk_writes_completed_total|node_filesystem_device_error|node_gpu_count|node_gpu_hourly_cost|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_network_transmit_bytes_total|node_ram_hourly_cost|node_total_hourly_cost|pod_pvc_allocation|pv_hourly_cost|service_selector_labels|statefulSet_match_labels|kubecost_pv_info|up)
action: keep
  # Settings for the bundled Prometheus server deployment.
  server:
    # Disabling persistent volume to make the chart compatible with new EKS clusters
    # (this matches the behavior of `kube-prometheus-stack` chart).
    persistentVolume:
      enabled: false
    # Schedule only on non-Fargate Linux nodes (Fargate nodes carry the
    # eks.amazonaws.com/compute-type=fargate label, excluded below).
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: eks.amazonaws.com/compute-type
                  operator: NotIn
                  values:
                    - fargate
    nodeSelector:
      kubernetes.io/os: linux
# Values for the OpenCost (sub)chart.
opencost:
  # Top-level toggle: OpenCost is not deployed by default.
  enabled: false
  opencost:
    nodeSelector:
      kubernetes.io/os: linux
    # Point the OpenCost exporter at the in-cluster Prometheus server.
    prometheus:
      internal:
        enabled: true
        port: 80
        # Helm template expressions are quoted so an empty or boolean-looking
        # expansion cannot change the YAML type of the value.
        namespaceName: "{{ .Release.Namespace }}"
        serviceName: "{{ .Release.Name }}-prometheus-server"
    # The OpenCost UI is not deployed.
    ui:
      enabled: false