diff --git a/base-helm-configs/grafana/grafana-helm-overrides.yaml b/base-helm-configs/grafana/grafana-helm-overrides.yaml index 9d5f6a13d..08b195aa9 100644 --- a/base-helm-configs/grafana/grafana-helm-overrides.yaml +++ b/base-helm-configs/grafana/grafana-helm-overrides.yaml @@ -38,93 +38,143 @@ grafana.ini: type: mysql host: mariadb-cluster.grafana.svc:3306 user: $__file{/etc/secrets/grafana-db/username} - password: $__file{/etc/secrets/grafana-db/password} + password: $__file{/etc/secrets/grafana-db/password} name: grafana datasources: datasources.yaml: apiversion: 1 datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 - isdefault: true - - name: Loki - type: loki - access: proxy - url: http://loki-gateway.{{ $.Release.Namespace }}.svc.cluster.local:80 - editable: false + - name: Prometheus + type: prometheus + access: proxy + url: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 + isdefault: true + - name: Loki + type: loki + access: proxy + url: http://loki-gateway.{{ $.Release.Namespace }}.svc.cluster.local:80 + editable: false alerting: rules.yaml: groups: - - orgId: 1 - name: loki 1 min eval - folder: rules - interval: 1m - rules: - - uid: ba943125-33ca-4e4e-85f8-13359a8e4d65 - title: OVN claim storm - condition: B - data: - - refId: A + - orgId: 1 + name: loki 1 min eval + folder: rules + interval: 1m + rules: + - uid: ba943125-33ca-4e4e-85f8-13359a8e4d65 + title: OVN claim storm + condition: B + data: + - refId: A + queryType: instant + relativeTimeRange: + from: 60 + to: 0 + datasourceUid: P8E80F9AEF21F6940 + model: + editorMode: builder + expr: rate({app="ovs"} |= `binding|INFO|cr-lrp` [1m]) + intervalMs: 60000 + maxDataPoints: 43200 + queryType: instant + refId: A + - refId: B + relativeTimeRange: + from: 60 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + - 0 + type: gt + operator: + type: 
and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: OK + execErrState: Error + for: 0s + notifications: + - uid: prom-alertmanager-notification + annotations: + description: >- + Checks app=ovs (ovs-ovn) pod logs for lines with string + 'binding|INFO|cr-lrp' + summary: >- + This alerts on rapid port claims for cr-lrp ports on OVN + gateway nodes, which overloads the OVN south database and + interferes with the function of the affected ports. + labels: {} + isPaused: false + # Generated UUID using 'uuidgen' + - uid: c14dd8fd-54ec-4e15-9813-e02cc3269899 + title: Neutron IPAM Duplicate Entry Error + condition: B + data: + - refId: A + queryType: instant + relativeTimeRange: + from: 60 + to: 0 + # Using same loki datasource as rule#ba943125-33ca-4e4e-85f8-13359a8e4d65 + datasourceUid: P8E80F9AEF21F6940 + model: + expr: rate({app="fluentbit"} |= `Duplicate entry|ERROR` [1m]) queryType: instant - relativeTimeRange: - from: 60 - to: 0 - datasourceUid: P8E80F9AEF21F6940 - model: - editorMode: builder - expr: rate({app="ovs"} |= `binding|INFO|cr-lrp` [1m]) - intervalMs: 60000 - maxDataPoints: 43200 - queryType: instant - refId: A - - refId: B - relativeTimeRange: - from: 60 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 1 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: OK - execErrState: Error - for: 0s - notifications: - - uid: prom-alertmanager-notification - annotations: - description: >- - Checks app=ovs (ovs-ovn) pod logs for lines with string - 'binding|INFO|cr-lrp' - summary: >- - This alerts on rapid port 
claims for cr-lrp ports on OVN - gateway nodes, which overloads the OVN south database and - interferes with the function of the affected ports. - labels: {} - isPaused: false + refId: A + - refId: B + relativeTimeRange: + # Past 60 seconds (can be adjusted further) + from: 60 + # 0 denotes till current time + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + - 0 + type: gt + operator: + type: and + reducer: + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + refId: B + type: threshold + noDataState: OK + execErrState: Error + notifications: + - uid: prom-alertmanager-notification + annotations: + summary: > + Checks for log lines containing 'Duplicate entry|ERROR' in nova logs. + isPaused: false contactpoints.yaml: secret: apiVersion: 1