deploy/helm/values.yaml

# This values file provides the default values for the chart. Placeholders like
# <CLUSTER_NAME> will be provided via the values.yaml provided during the cluster
# onboarding process--or you can provide your own with the appropriate values
# given to you during initial onboarding for an initial cluster and region.

nameOverride: ""
fullnameOverride: ""

# array of pair "name: secret"
imagePullSecrets: []

otel:
  # e.g. otel.collector.na-01.cloud.solarwinds.com:443
  # A list of all available OTEL endpoints: https://documentation.solarwinds.com/en/success_center/observability/content/system_requirements/endpoints.htm
  endpoint: <OTEL_ENVOY_ADDRESS>
  tls_insecure: false

  # SWO API token used for authentication. If filled it will create secret that will be used by the collector
  api_token: ""

  # The OTEL collector supports an HTTPS proxy. Specify the full URL of the HTTPS
  # proxy here. e.g. https_proxy: "https://myproxy.mydomain.com:8080"
  https_proxy_url: ""

  image:
    repository: solarwinds/swi-opentelemetry-collector
    # if not set appVersion field from Chart.yaml is used
    tag: ""
    pullPolicy: IfNotPresent

  windows:
    enabled: true
    image:
      repository: solarwinds/swi-opentelemetry-collector
      # if not set appVersion field from Chart.yaml is used
      tag: ""
      pullPolicy: IfNotPresent

  init_images:
    swi_endpoint_check:
      repository: "fullstorydev/grpcurl"
      tag: "v1.8.9"
      pullPolicy: IfNotPresent

    busy_box:
      repository: "busybox"
      tag: "1.36.1"
      pullPolicy: IfNotPresent

  node_collector:
    terminationGracePeriodSeconds: 600

    sending_queue:
      enabled: true

      # Number of consumers that dequeue batches; ignored if enabled is false
      num_consumers: 20

      # Maximum number of batches kept in memory before dropping; ignored if enabled is false User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per seconds
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000

      # Configuration for persistent data storage of sending queue of node-collector
      # For detailed explanation see https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md#persistent-queue
      persistent_storage:
        enabled: false
        directory: /var/lib/swo/sending_queue

    retry_on_failure:
      enabled: true

      # Time to wait after the first failure before retrying; ignored if enabled is false
      initial_interval: 10s

      # Is the upper bound on backoff; ignored if enabled is false
      max_interval: 30s

      # Is the maximum amount of time spent trying to send a batch; ignored if enabled is false
      max_elapsed_time: 300s

    # Time to wait per individual attempt to send data to SWO
    timeout: 15s

  # Configuration for metrics collection
  metrics:
    # Define whether metrics will be collected and sent
    enabled: true

    # configuration for metric discovery
    autodiscovery:
      prometheusEndpoints:
        # Define whether metrics will be discovered and scraped by prometheus annotations
        enabled: true

        # Additional custom rule for discovery (following rule is always present: type == "pod" && annotations["prometheus.io/scrape"] == "true")
        # Available fields
        #   `id` - ID of source endpoint
        #   `name` - name of the pod
        #   `namespace` - namespace of the pod
        #   `uid` - unique id of the pod
        #   `labels` - map of labels set on the pod
        #   `annotations` - map of annotations set on the pod
        # Example: namespace == "test-namespace" && labels["app"] == "test-app"
        additionalRules: ""

        podMonitors:
          rules: []
        # Additional custom rules for discovery
        # Available fields for podMonitors rules are the same as for additionalRules

        #    - rule: labels["example"] == "value"
        #      metrics_path: "/metrics"
        #      endpoint_port: 8080

        customTransformations:
          # list of metrics that are counters should be converted to rate
          counterToRate: []

        # This filter is applied after metric processing, it is the place where metrics could be filtered out
        # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference
        filter: {}

    # Check if SWI OTEL endpoint is reachable
    swi_endpoint_check: true

    # Check if Prometheus endpoint (provided by otel.metrics.prometheus.url) is reachable
    prometheus_check: false

    sending_queue:
      enabled: true

      # Number of consumers that dequeue batches; ignored if enabled is false
      num_consumers: 20

      # Maximum number of batches kept in memory before dropping; ignored if enabled is false User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per seconds
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000

      # When enabled sending_queue of metrics collector will be offloaded to disk. It will use emptyDir volume
      # This will reduce amount of memory, but it will use slightly more CPU and will have slightly lower throughput
      offload_to_disk: false

    retry_on_failure:
      enabled: true

      # Time to wait after the first failure before retrying; ignored if enabled is false
      initial_interval: 10s

      # Is the upper bound on backoff; ignored if enabled is false
      max_interval: 30s

      # Is the maximum amount of time spent trying to send a batch; ignored if enabled is false
      max_elapsed_time: 300s

    prometheus:
      # URL of prometheus where to scrape
      url: ""

      # Prometheus URL scheme. It can take the values `http` or `https`
      scheme: http

      # How often the metrics are scraped from Prometheus
      scrape_interval: 60s

    # Time to wait per individual attempt to send data to SWO
    timeout: 15s

    # Configuration for endpoint on which metrics collector receives OpenTelemetry metrics
    otlp_endpoint:
      port: 4317

    kube-state-metrics:
      # URL of kube-state-metrics where to scrape
      url: ""

      # Prometheus URL scheme. It can take the values `http` or `https`
      scheme: http

      # How often the metrics are scraped from Prometheus
      scrape_interval: 60s

    # This filter is applied after metric processing, it is the place where metrics could be filtered out
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference
    filter:
      exclude:
        match_type: regexp
        metric_names:
          - .*_temp
          - apiserver_request_total

    # Use this configuration to scrape extra metrics from Prometheus. Multiple metrics can be specified.
    # See format in https://prometheus.io/docs/prometheus/latest/querying/basics/#instant-vector-selectors
    extra_scrape_metrics: []

    # In case `otel.metrics.autodiscovery.prometheusEndpoints.enabled` is set to `true` (which is by default) there is a possibility
    # that those extra prometheus metrics are scraped by the collector, so in this case `extra_scrape_metrics` is ignored. By setting
    # `force_extra_scrape_metrics` to `true` you can force the collector to scrape those metrics.
    force_extra_scrape_metrics: false

    # Batching configuration for metrics
    # see https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s

    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference
    memory_limiter:
      check_interval: 1s
      limit_mib: 2560
      spike_limit_mib: 512

    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference
    memory_ballast:
      size_mib: 700

    # Resource configuration for singleton collector
    resources:
      requests:
        memory: 3Gi
      limits:
        # override if your singleton collector is being OOM-killed.
        memory: 3Gi

    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true

        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""

      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: true

        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""

    # Telemetry information of the collector
    # see https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false

          # Additional labels
          additionalLabels: {}
          # key: value

          # Override namespace (default is the same as K8s collector)
          namespace:

          # Interval to scrape metrics
          interval: 60s

          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s

    # Scheduling configurations
    # By default: set to run on linux amd64 nodes
    nodeSelector: {}
    tolerations: []
    affinity: {}
    terminationGracePeriodSeconds: 600

    readinessProbe:
      initialDelaySeconds: 10
    livenessProbe:
      initialDelaySeconds: 10

  # Configuration for Events collection
  events:
    # Define whether events will be collected and sent
    enabled: true

    # This filter is applied after events processing, it is the place where events could be filtered out
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference
    # filter:

    # Batching configuration for events
    # see https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s

    sending_queue:
      enabled: true

      # Number of consumers that dequeue batches; ignored if enabled is false
      num_consumers: 10

      # Maximum number of batches kept in memory before dropping; ignored if enabled is false User should calculate this as num_seconds * requests_per_second / requests_per_batch where:
      # * num_seconds is the number of seconds to buffer in case of a backend outage
      # * requests_per_second is the average number of requests per seconds
      # * requests_per_batch is the average number of requests per batch (if the batch processor is used, the metric send_batch_size can be used for estimation)
      queue_size: 1000

      # When enabled sending_queue of metrics collector will be offloaded to disk. It will use emptyDir volume
      # This will reduce amount of memory, but it will use slightly more CPU and will have slightly lower throughput
      offload_to_disk: false

    retry_on_failure:
      enabled: true

      # Time to wait after the first failure before retrying; ignored if enabled is false
      initial_interval: 10s

      # Is the upper bound on backoff; ignored if enabled is false
      max_interval: 30s

      # Is the maximum amount of time spent trying to send a batch; ignored if enabled is false
      max_elapsed_time: 300s

    # Time to wait per individual attempt to send data to SWO
    timeout: 15s

    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference
    memory_limiter:
      check_interval: 1s
      limit_mib: 512
      spike_limit_mib: 128

    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference
    memory_ballast:
      size_mib: 300

    # Resource configuration
    resources:
      requests:
        memory: 1000Mi
      limits:
        memory: 1000Mi

    # Telemetry information of the collector
    # see https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false

          # Additional labels
          additionalLabels: {}
          # key: value

          # Override namespace (default is the same as K8s collector)
          namespace:

          # Interval to scrape metrics
          interval: 60s

          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s

    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true

        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""

      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: false

        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        excludePattern: ""

    # Scheduling configurations
    # By default: set to run on linux amd64 nodes
    nodeSelector: {}
    tolerations: []
    affinity: {}

    terminationGracePeriodSeconds: 600

  # Configuration for Logs collection
  logs:
    # Define whether logs will be collected and sent
    enabled: true

    # If true, the journal logs on nodes will be collected
    # Each log has following attributes so they can be filtered out by them using filter configuration:
    #   * sw.k8s.log.type=journal
    #   * k8s.cluster.name - name of the cluster (input generated during onboarding)
    #   * sw.k8s.cluster.uid - UUID of the cluster (input generated during onboarding)
    #   * sw.k8s.agent.manifest.version - version of the manifest
    #   * k8s.node.name - node from which the journal logs are coming from
    journal: true

    # If true, the container logs will be collected
    # Log collection uses `filelog` OTEL receiver under the hood
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver
    # Each log has following attributes so they can be filtered out by them using filter configuration:
    #   * sw.k8s.log.type=container
    #   * k8s.cluster.name - name of the cluster (input generated during onboarding)
    #   * sw.k8s.cluster.uid - name of the cluster (input generated during onboarding)
    #   * sw.k8s.agent.manifest.version - version of the manifest
    #   * k8s.node.name - node from which the container logs are coming from
    #   * k8s.container.name - name of the container that is reporting logs
    #   * k8s.namespace.name - namespace of the container
    #   * k8s.pod.name - pod of the container
    #   * run_id - id of the container run
    #   * k8s.pod.uid - pod's uid
    container: true

    # This filter is applied after initial log processing, it is the place where logs could be filtered out
    # see https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor for configuration reference
    filter:
      include:
        match_type: regexp
        # a log has to match all expressions in the list to be included
        # see https://github.com/google/re2/wiki/Syntax for regexp syntax
        record_attributes:
          # allow only system namespaces (kube-system, kube-public)
          - key: k8s.namespace.name
            value: ^kube-.*$

    # Batching configuration for logs
    # see https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor for configuration reference
    batch:
      send_batch_size: 512
      send_batch_max_size: 512
      timeout: 1s

    # k8s_instrumentation controls the automatic extraction of Kubernetes metadata from resources.
    # It instruments OpenTelemetry (OTEL) resources being sent.
    k8s_instrumentation:
      labels:
        # Set 'enabled' to true to instrument Kubernetes labels.
        enabled: true

        # Provide a regular expression pattern to exclude specific labels from instrumentation.
        # Example: To exclude labels with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        # by default all labels are excluded
        excludePattern: "k8s\\.\\w+\\.labels\\..*"

      annotations:
        # Set 'enabled' to true to instrument Kubernetes annotations.
        enabled: true

        # Provide a regular expression pattern to exclude specific annotations from instrumentation.
        # Example: To exclude annotations with 'internal' or 'private' in their names, use the following pattern:
        # excludePattern: ".*internal.*|.*private.*"
        # by default all annotations are excluded
        excludePattern: "k8s\\.\\w+\\.annotations\\..*"


    # Telemetry information of the collector
    # see https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/troubleshooting.md#observability for configuration reference
    telemetry:
      logs:
        enabled: true
        level: "info"
      metrics:
        enabled: true
        address: 0.0.0.0:8888
        podMonitor:
          # Create a `PodMonitor` to collect Prometheus metrics.
          enabled: false

          # Additional labels
          additionalLabels: {}
          # key: value

          # Override namespace (default is the same as K8s collector)
          namespace:

          # Interval to scrape metrics
          interval: 60s

          # Timeout if metrics can't be retrieved in given time interval
          scrapeTimeout: 25s

    # Memory limiter configuration. The memory limiter is used to prevent out of memory situations on the collector.
    # see https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/memorylimiterprocessor for configuration reference
    memory_limiter:
      check_interval: 1s
      limit_mib: 800
      spike_limit_mib: 300

    # Memory Ballast enables applications to configure memory ballast for the process.
    # See https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/ballastextension for configuration reference
    # memory_ballast:
    #   size_mib: 200

    # Resource configuration for Log collector
    resources:
      requests:
        memory: 50Mi
      limits:
        memory: 1Gi

    # Scheduling configurations
    nodeSelector: {}
    # By default: tolerations allow the DaemonSet to be deployed on tainted nodes so that we can also collect logs from those nodes.
    tolerations: []
    # By default: affinity is set to run the DaemonSet on linux amd64.
    affinity: {}

    # Properties that can be configured on filelog reciever. For full description of properties
    # see https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver
    receiver:
      start_at: end
      poll_interval: 200ms
      max_concurrent_files: 10
      encoding: utf-8
      fingerprint_size: 1kb
      max_log_size: 1MiB

    # Properties that can be configured on filestorage that is used to persist log checkpoints.
    # see https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver
    filestorage:
      directory: /var/lib/swo/checkpoints
      timeout: 5s

cluster:
  name: <CLUSTER_NAME>
  uid: <CLUSTER_UID>

# If enabled it creates CronJob that will periodically check for new versions of the Helm chart and upgrade if available
# Keep in mind that in order to update resources the job has full access to the namespace where it is deployed and also have access to modify ClusterRole and ClusterRolBinding
autoupdate:
  enabled: false

  # How often the update will be checked. See https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#writing-a-cronjob-spec
  schedule: "@daily"

  # Whether it should check for pre-release versions
  devel: false

  image:
    repository: "alpine/k8s"
    tag: "1.27.8"
    pullPolicy: IfNotPresent

# Set labels to every deployed resource
# commonLabels:

kube-state-metrics:
  enabled: true
  service:
    annotations:
      opencost.scrape: "true"

  prometheusScrape: false

  nodeSelector:
    kubernetes.io/os: linux

# Node exporter is deployed only in case opencost section is enabled. Otherwise this section can be ignored.
prometheus-node-exporter:
  service:
    port: 9101
    targetPort: 9101
    annotations:
      opencost.scrape: "true"
      prometheus.io/scrape: "false"

  rbac:
    pspEnabled: false

  nodeSelector:
    kubernetes.io/os: linux

  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: eks.amazonaws.com/compute-type
            operator: NotIn
            values:
            - fargate


swoagent:
  # Whether the SWO Agent should be deployed as part of this chart.
  # If not, integrations are not available.

  enabled: false
  image:
    repository: solarwinds/swo-agent
    tag: "v2.6.28"
    pullPolicy: IfNotPresent
  resources:
    limits:
      memory: 800Mi
    requests:
      memory: 800Mi
      cpu: 100m
  nodeSelector: {}
  affinity: {}

aws_fargate:
  # Enable support for AWS EKS Fargate environment
  enabled: false

  # Configuration for Logs collection
  logs:
    # Enable deployment of AWS FluentBit to the Fargate cluster
    enabled: false

    # AWS region where the Fargate cluster is running
    region:

    # Include additional FluentBit filters
    # see https://docs.fluentbit.io/manual/pipeline/filters
    # NOTE: The FluentBit configuration expects four spaces as indentation within sections
    filters: ""

ebpfNetworkMonitoring:
  enabled: false
  kernelCollector:
    enabled: true

    telemetry:
      logs:
        level: "warning"

    image:
      repository: "otel/opentelemetry-ebpf-kernel-collector"
      tag: "v0.10.1"
      pullPolicy: IfNotPresent

    resources:
      requests:
        memory: 50Mi

    # Scheduling configurations
    nodeSelector: {}
    # By default: tolerations allow the DaemonSet to be deployed on tainted nodes so that we can also collect logs from those nodes.
    tolerations: []
    # By default: affinity is set to run the DaemonSet on linux amd64.
    affinity: {}

  k8sCollector:
    enabled: true

    telemetry:
      logs:
        level: "warning"

    watcher:
      image:
        repository: "otel/opentelemetry-ebpf-k8s-watcher"
        tag: "v0.10.1"
        pullPolicy: IfNotPresent

    relay:
      image:
        repository: "otel/opentelemetry-ebpf-k8s-relay"
        tag: "v0.10.1"
        pullPolicy: IfNotPresent
  reducer:
    disableMetrics: []
    enableMetrics: []

    # Port on which the reducer will listen for metrics from kernelCollector and k8sCollector
    telemetryPort: 7000

    # Enables id-id time-series generation. The id-id time-series carry the lowest-level information but are of the greatest volume and cardinality, so are disabled by default.
    # This also adds IP address to ingested metrics which is useful for identification which Pod from the workload is causing the traffic.
    enableIdIdGeneration: false

    # At present, scaling the reducer is a manual try-and-see task. The reducer runs a data processing pipeline separated into three stages – ingest, matching and aggregation.
    # Usually, the best approach is to scale all the stages by the same factor. Keep in mind that each shard consumes a certain amount of memory, whether it is heavily loaded or not.
    # Read more info about architecture https://github.com/open-telemetry/opentelemetry-ebpf/blob/main/docs/reducer/architecture.md
    numIngestShards: 3
    numMatchingShards: 3
    numAggregationShards: 3

    telemetry:
      logs:
        level: "warning"
      metrics:
        enabled: false

    image:
      repository: "otel/opentelemetry-ebpf-reducer"
      tag: "v0.10.1"
      pullPolicy: IfNotPresent

# Prometheus is deployed only in case opencost section is enabled. Otherwise this section can be ignored.
prometheus:
  alertmanager:
    enabled: false
  prometheus-node-exporter:
    enabled: false
  prometheus-pushgateway:
    enabled: false
  kube-state-metrics:
    enabled: false
  serverFiles:
    prometheus.yml:
      rule_files:
        - /etc/config/recording_rules.yml
        - /etc/config/alerting_rules.yml
        ## Below two files are DEPRECATED will be removed from this default values file
        - /etc/config/rules
        - /etc/config/alerts
      scrape_configs:
      - job_name: 'kubernetes-nodes-cadvisor'

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        kubernetes_sd_configs:
          - role: node

        # This configuration will work only on kubelet 1.7.3+
        # As the scrape endpoints for cAdvisor have changed
        # if you are using older version you need to change the replacement to
        # replacement: /api/v1/nodes/$1:4194/proxy/metrics
        # more info here https://github.com/coreos/prometheus-operator/issues/633
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

        metric_relabel_configs:
          - source_labels: [ __name__ ]
            regex: (container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_errors_total|container_network_transmit_errors_total|container_network_receive_packets_dropped_total|container_network_transmit_packets_dropped_total|container_memory_usage_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_periods_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_periods_total|container_cpu_cfs_periods_total|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_usage_bytes|container_fs_limit_bytes|container_spec_cpu_shares|container_spec_memory_limit_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_fs_reads_bytes_total|container_network_receive_bytes_total|container_fs_writes_bytes_total|container_fs_reads_bytes_total|cadvisor_version_info|kubecost_pv_info)
            action: keep
          - source_labels: [ container ]
            target_label: container_name
            regex: (.+)
            action: replace
          - source_labels: [ pod ]
            target_label: pod_name
            regex: (.+)
            action: replace
      - job_name: 'kubernetes-nodes'

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        kubernetes_sd_configs:
          - role: node

        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics

        metric_relabel_configs:
          - source_labels: [ __name__ ]
            regex: (kubelet_volume_stats_used_bytes) # this metric is in alpha
            action: keep
      # Scrape config for service endpoints.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # via the following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
      # to set this to `https` & most likely set the `tls_config` of the scrape config.
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
      # service then set this appropriately.
      - job_name: 'kubernetes-service-endpoints'

        kubernetes_sd_configs:
          - role: endpoints

        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_opencost_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
          - source_labels: [__meta_kubernetes_pod_node_name]
            action: replace
            target_label: kubernetes_node
        metric_relabel_configs:
          - source_labels: [ __name__ ]
            regex: (container_cpu_allocation|container_cpu_usage_seconds_total|container_fs_limit_bytes|container_fs_writes_bytes_total|container_gpu_allocation|container_memory_allocation_bytes|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|DCGM_FI_DEV_GPU_UTIL|deployment_match_labels|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_available|kube_job_status_failed|kube_namespace_annotations|kube_namespace_labels|kube_node_info|kube_node_labels|kube_node_status_allocatable|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_node_status_capacity|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_node_status_condition|kube_persistentvolume_capacity_bytes|kube_persistentvolume_status_phase|kube_persistentvolumeclaim_info|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_limits_cpu_cores|kube_pod_container_resource_limits_memory_bytes|kube_pod_container_resource_requests|kube_pod_container_resource_requests_cpu_cores|kube_pod_container_resource_requests_memory_bytes|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_terminated_reason|kube_pod_labels|kube_pod_owner|kube_pod_status_phase|kube_replicaset_owner|kube_statefulset_replicas|kube_statefulset_status_replicas|kubecost_cluster_info|kubecost_cluster_management_cost|kubecost_cluster_memory_working_set_bytes|kubecost_load_balancer_cost|kubecost_network_internet_egress_cost|kubecost_network_region_egress_cost|kubecost_network_zone_egress_cost|kubecost_node_is_spot|kubecost_pod_network_egress_bytes_total|node_cpu_hourly_cost|node_cpu_seconds_total|node_disk_reads_completed|node_disk_reads_completed_total|node_disk_writes_completed|node_disk_writes_completed_total|node_filesystem_device_error|node_gpu_count|node_gpu_hourly_cost|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_network_transmit_bytes_total|node_ram_hourly_cost|node_total_hourly_cost|pod_pvc_allocation|pv_hourly_cost|service_selector_labels|statefulSet_match_labels|kubecost_pv_info|up)
            action: keep


  server:
    # Disabling persistent volume to make the chart compatible with new EKS clusters
    # (this matches the behavior of `kube-prometheus-stack` chart).
    persistentVolume:
      enabled: false
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
          - matchExpressions:
            - key: eks.amazonaws.com/compute-type
              operator: NotIn
              values:
              - fargate
    nodeSelector:
      kubernetes.io/os: linux

opencost:
  enabled: false
  opencost:
    nodeSelector:
      kubernetes.io/os: linux
    prometheus:
      internal:
        enabled: true
        port: 80
        namespaceName: "{{ .Release.Namespace }}"
        serviceName: "{{ .Release.Name }}-prometheus-server"
    ui:
      enabled: false