Skip to content

Commit

Permalink
add democratic-csi local-path and loki
Browse files Browse the repository at this point in the history
  • Loading branch information
ryan-mcd committed Jan 5, 2024
1 parent 6897e7d commit b114adc
Show file tree
Hide file tree
Showing 14 changed files with 672 additions and 2 deletions.
2 changes: 2 additions & 0 deletions cluster/apps/monitoring/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ resources:
- speedtest-exporter
- thanos
- network-ups-tools/ks.yaml
- loki/secret.sops.yaml
- loki/ks.yaml
206 changes: 206 additions & 0 deletions cluster/apps/monitoring/loki/app/helm-release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta2.json
# Flux HelmRelease installing the `loki` chart from the `grafana-charts`
# HelmRepository. Rings use memberlist and chunks/index are shipped to an
# S3-compatible bucket.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: loki
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: loki
      version: 5.41.4
      sourceRef:
        kind: HelmRepository
        name: grafana-charts
        namespace: flux-system
      interval: 30m
  values:
    loki:
      # Everything under structuredConfig is Loki's own configuration file.
      structuredConfig:
        auth_enabled: false

        server:
          log_level: info
          http_listen_port: 3100
          grpc_listen_port: 9095

          # 8388608 bytes = 8 MiB for both gRPC directions.
          grpc_server_max_recv_msg_size: 8388608
          grpc_server_max_send_msg_size: 8388608

        memberlist:
          join_members:
            - "loki-memberlist"

        limits_config:
          retention_period: 14d
          enforce_metric_name: false
          reject_old_samples: true
          reject_old_samples_max_age: 168h
          max_cache_freshness_per_query: 10m
          split_queries_by_interval: 15m
          ingestion_rate_mb: 50
          ingestion_burst_size_mb: 1000
          per_stream_rate_limit: 5M
          per_stream_rate_limit_burst: 20M
          shard_streams:
            enabled: true

        schema_config:
          configs:
            - from: "2021-08-01"
              store: boltdb-shipper
              object_store: s3
              schema: v12
              index:
                prefix: loki_index_
                period: 24h

        common:
          path_prefix: /var/loki
          replication_factor: 2
          storage:
            s3:
              s3: null
              insecure: true
              s3forcepathstyle: true
              bucketnames: loki
              # The ${...} placeholders are not defined in this file.
              # NOTE(review): presumably filled in by Flux post-build
              # variable substitution - confirm against the Kustomization.
              endpoint: ${S3_BUCKET_HOST}
              region: ${S3_BUCKET_REGION}
              access_key_id: ${S3_ACCESS_KEY}
              secret_access_key: ${S3_SECRET_KEY}
          ring:
            kvstore:
              store: memberlist

        ruler:
          enable_api: true
          enable_alertmanager_v2: true
          alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
          storage:
            type: local
            local:
              directory: /rules
          # rule_path: /rules
          rule_path: /tmp/scratch
          ring:
            kvstore:
              store: memberlist

        distributor:
          ring:
            kvstore:
              store: memberlist

        compactor:
          working_directory: /var/loki/boltdb-shipper-compactor
          shared_store: s3
          compaction_interval: 10m
          retention_enabled: true
          retention_delete_delay: 2h
          retention_delete_worker_count: 150

        ingester:
          max_chunk_age: 1h
          lifecycler:
            ring:
              kvstore:
                store: memberlist

        analytics:
          reporting_enabled: false

      podAnnotations:
        secret.reloader.stakater.com/reload: loki-secret

    # Each `affinity` value below is a template string, not a YAML mapping:
    # the `nindent 12` inside it must line up with the children of
    # `matchLabels` once the block scalar's common indent is stripped.
    gateway:
      replicas: 2
      affinity: |
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    {{- include "loki.gatewaySelectorLabels" . | nindent 12 }}
                topologyKey: kubernetes.io/hostname
      enabled: true
      image:
        registry: ghcr.io
        repository: nginxinc/nginx-unprivileged
        tag: 1.25-alpine
      ingress:
        enabled: false

    read:
      replicas: 2
      affinity: |
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    {{- include "loki.readSelectorLabels" . | nindent 12 }}
                topologyKey: kubernetes.io/hostname

    write:
      replicas: 2
      affinity: |
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    {{- include "loki.writeSelectorLabels" . | nindent 12 }}
                topologyKey: kubernetes.io/hostname
      persistence:
        size: 20Gi
        storageClass: local-hostpath

    backend:
      replicas: 2
      affinity: |
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    {{- include "loki.backendSelectorLabels" . | nindent 12 }}
                topologyKey: kubernetes.io/hostname
      persistence:
        size: 20Gi
        storageClass: local-hostpath
      # Disabled: mounts for a rules ConfigMap and a scratch directory.
      # extraVolumeMounts:
      #   - name: rules
      #     mountPath: /rules/fake
      #   - name: scratch
      #     mountPath: /tmp/scratch
      # extraVolumes:
      #   - name: rules
      #     configMap:
      #       name: loki-alerting-rules
      #   - name: scratch
      #     emptyDir: {}

    # Only the dashboards annotation is set; every other monitoring
    # feature listed here is switched off.
    monitoring:
      dashboards:
        annotations:
          grafana_folder: Loki
      serviceMonitor:
        enabled: false
        metricsInstance:
          enabled: false
      selfMonitoring:
        enabled: false
        grafanaAgent:
          installOperator: false
      lokiCanary:
        enabled: false

    sidecar:
      image:
        repository: ghcr.io/kiwigrid/k8s-sidecar

    test:
      enabled: false
7 changes: 7 additions & 0 deletions cluster/apps/monitoring/loki/app/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for the Loki app: bundles the HelmRelease and the
# rule manifest that sit next to this file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./helm-release.yaml
  - ./prometheus-rule.yaml
174 changes: 174 additions & 0 deletions cluster/apps/monitoring/loki/app/prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/monitoring.coreos.com/prometheusrule_v1.json
# Rules evaluated by Prometheus against Loki's own request metrics.
# Every expression in this document must be PromQL; log-based (LogQL)
# alerts live in the ConfigMap document further down.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
spec:
  groups:
    - name: loki.rules
      rules:
        - alert: LokiRequestErrors
          annotations:
            # The expression already scales the ratio to 0-100, so the value
            # is printed as-is; humanizePercentage would multiply by 100 a
            # second time (15% would render as 1500%).
            message: '{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.'
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        - alert: LokiRequestPanics
          annotations:
            # The value is a panic count over 10m, not a ratio, so it is
            # rendered with humanize rather than humanizePercentage.
            message: "{{ $labels.job }} is experiencing {{ $value | humanize }} panics in the last 10 minutes."
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          # Reads the namespace_job_route 99quantile series recorded below;
          # routes whose name contains "tail" are excluded.
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical

        # Recording rules aggregated by job.
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate

        # Recording rules aggregated by job and route.
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate

        # Recording rules aggregated by namespace, job and route.
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
---
# Log-based alerts. These expressions are LogQL (stream selectors followed
# by `| json` / `|~` pipelines), which Prometheus cannot parse, so they are
# kept out of the PrometheusRule above and packaged as a rule file for
# Loki's ruler instead.
# NOTE(review): this ConfigMap has no effect until the ruler loads it. The
# HelmRelease has a commented-out `loki-alerting-rules` volume mounted at
# /rules/fake that looks intended for this - confirm and enable it there.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-alerting-rules
data:
  loki-alerting-rules.yaml: |-
    groups:
      - name: smart
        rules:
          - alert: SMARTFailure
            expr: |
              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              hostname: "{{ $labels.hostname }}"
              summary: "{{ $labels.hostname }} has reported SMART failures"

      - name: frigate
        rules:
          - alert: FrigateMQTTUnreachable
            # Aggregated `by (app)` so the `app` label read by the
            # annotations survives the sum.
            expr: |
              sum by (app) (count_over_time({app="frigate"} |~ "(?i)unable to connect to mqtt server"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              app: "{{ $labels.app }}"
              summary: "{{ $labels.app }} is unable to reach MQTT"

      - name: home-assistant
        rules:
          - alert: HomeAssistantPostgresUnreachable
            expr: |
              sum by (app) (count_over_time({app="home-assistant"} |~ "(?i)error in database connectivity"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              app: "{{ $labels.app }}"
              summary: "{{ $labels.app }} is unable to connect to postgres"

      # - name: zwave-js-ui
      #   rules:
      #     - alert: ZwaveMQTTUnreachable
      #       expr: |
      #         sum(count_over_time({app="zwave-js-ui"} |~ "(?i)error while connecting mqtt"[2m])) > 0
      #       for: 2m
      #       labels:
      #         severity: critical
      #         category: logs
      #       annotations:
      #         app: "{{ $labels.app }}"
      #         summary: "{{ $labels.app }} is unable to reach MQTT"

      # - name: zigbee2mqtt
      #   rules:
      #     - alert: ZigbeeMQTTUnreachable
      #       expr: |
      #         sum(count_over_time({app="zigbee2mqtt"} |~ "(?i)not connected to mqtt server"[2m])) > 0
      #       for: 2m
      #       labels:
      #         severity: critical
      #         category: logs
      #       annotations:
      #         app: "{{ $labels.app }}"
      #         summary: "{{ $labels.app }} is unable to reach MQTT"
Loading

0 comments on commit b114adc

Please sign in to comment.