Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
2 contributors

Users who have contributed to this file

@pietroalbini @Mark-Simulacrum
178 lines (152 sloc) 6.19 KB
---
# Ansible play for the `monitoring` host group. It applies, in order, the
# common, backup, postgresql, monitoring-server, letsencrypt and nginx roles.
#
# Every `vars_*` value referenced below is not defined in this file and is
# expected to be supplied from elsewhere (inventory / vars files).
#
# Note on escaping: strings written as "{{ '{{ ... }}' }}" make Ansible's
# Jinja emit a literal `{{ ... }}`, so the inner template is left untouched
# in the rendered configuration for Prometheus / Alertmanager to expand.

- hosts: monitoring
  become: true
  become_user: root

  roles:

    - role: common
      papertrail_url: "{{ vars_papertrail_url }}"
      collect_metrics_from:
        - 127.0.0.1
      sudo_users:
        - aidanhs
        - acrichto
        - simulacrum
        - pietro

    - role: backup
      repository: "{{ vars_backup_repository }}"
      password: "{{ vars_backup_password }}"
      env: "{{ vars_backup_env }}"
      interval: daily
      keep_daily: 7
      keep_weekly: 4

    # Database backing Grafana: the user/database names and the password
    # match the `grafana_db_*` variables passed to monitoring-server below.
    - role: postgresql
      users:
        grafana:
          password: "{{ vars_postgresql_grafana_password }}"
      databases:
        grafana:
          owner: grafana
          encoding: UTF-8

    - role: monitoring-server
      prometheus_scrape_interval: 5s
      prometheus_retention: 7d
      alertmanager_resolve_timeout: 5m

      prometheus_scrape:
        # Metrics about prometheus itself
        - job_name: prometheus
          static_configs:
            - targets: ['{{ inventory_hostname }}:9090']

        # System metrics scraped from node_exporter
        - job_name: node
          static_configs:
            # List of instances to scrape for system metrics
            - targets:
                - "{{ inventory_hostname }}:9100"
                - crater-aws-1.infra.rust-lang.org:9100
                - crater-azure-1.infra.rust-lang.org:9100
                - perf.rust-lang.org:9100
                - bots.infra.rust-lang.org:9100
                - rcs.infra.rust-lang.org:9100
                - docsrs.infra.rust-lang.org:9100
                - bastion.infra.rust-lang.org:9100
                - play.infra.rust-lang.org:9100
                - puppeteer.infra.rust-lang.org:9100

        # Metrics exposed by docs.rs over HTTPS at a non-default path
        - job_name: docsrs
          metrics_path: /about/metrics
          scheme: https
          static_configs:
            - targets:
                - docs.rs:443

        # Metrics exposed by Crater over HTTPS
        - job_name: crater
          scheme: https
          static_configs:
            - targets:
                - crater.rust-lang.org:443

      prometheus_rule_groups:
        # Alerts based on the `node` scrape job
        - name: node
          rules:
            # A `node` target has been unreachable for 2 minutes
            - alert: NodeDown
              expr: up{job="node"} == 0
              for: 2m
              annotations:
                summary: "{{ '{{ $labels.instance }}' }} is down"
                description: "Prometheus wasn't able to reach the instance {{ '{{ $labels.instance }}' }} for more than 2 minutes."

            # A systemd unit has been in the `failed` state for 1 minute
            - alert: SystemdUnitFailed
              expr: node_systemd_unit_state{state="failed",job="node"} == 1
              for: 1m
              annotations:
                summary: "{{ '{{ $labels.name }}' }} on {{ '{{ $labels.instance }}' }} failed"
                description: "The systemd unit {{ '{{ $labels.name }}' }} on the {{ '{{ $labels.instance }}' }} instance failed to execute."

            # Less than 10% of a filesystem is available for 1 minute;
            # tmpfs and fuse.lxcfs filesystems are excluded from the check
            - alert: DiskFull
              expr: node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} < 0.10
              for: 1m
              annotations:
                summary: "Filesystem {{ '{{ $labels.mountpoint }}' }} on {{ '{{ $labels.instance }}' }} is full"
                description: "There is less than 10% disk space left on the mount point {{ '{{ $labels.mountpoint }}' }} (device {{ '{{ $labels.device }}' }}) of the instance {{ '{{ $labels.instance }}' }}."

        # Alerts based on the `docsrs` scrape job
        - name: docsrs
          rules:
            # 100 or more crates have been queued for a full hour. The
            # `dispatch: docsrs` label is what the Alertmanager route below
            # matches on to pick the discord-docsrs receiver.
            - alert: LongQueue
              expr: docsrs_queued_crates_count{job="docsrs"} >= 100
              for: 1h
              labels:
                dispatch: docsrs
              annotations:
                summary: "The docs.rs build queue is unreasonably long"
                description: "There are more than 100 crates in the docs.rs queue, and the situation didn't resolve itself in the past hour. The build queue is available at https://docs.rs/releases/queue"

      alertmanager_route:
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        # Receiver used when none of the child routes below match
        receiver: discord-infra
        routes:
          # Alerts explicitly labelled for the docs.rs team
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              dispatch: docsrs

          # System-level alerts coming from the docs.rs server
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              job: node
              instance: docsrs.infra.rust-lang.org:9100

          # Suppress disk full alerts for Crater agents. They're supposed to go
          # near the limit and they're configured to handle things properly,
          # cleaning up unused files when they reach a threshold.
          - receiver: devnull
            match:
              job: node
              alertname: DiskFull
            match_re:
              instance: ^crater-[^\.]+\.infra\.rust-lang\.org:9100$

      alertmanager_receivers:
        # Both Discord receivers use `slack_configs` with "/slack" appended to
        # the configured URL.
        # NOTE(review): presumably a Discord webhook in Slack-compatible mode,
        # which would explain the placeholder `channel` value - confirm.
        - name: discord-infra
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_discord_infra }}/slack"
              channel: doesnt-matter
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired!' }}"
              text: "{{ '{{ range .Alerts }}**{{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}\ncc @infra-page' }}"

        - name: discord-docsrs
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_discord_docsrs }}/slack"
              channel: doesnt-matter
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired!' }}"
              text: "{{ '{{ range .Alerts }}**{{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}\ncc @docsrs-page' }}"

        # Receiver without any notification config: alerts routed here are
        # not delivered anywhere.
        - name: devnull

      grafana_github_teams:
        - 2357301  # rust-lang/infra
        - 3484533  # rust-lang/docsrs-ops
      grafana_domain: "{{ vars_grafana_domain }}"
      grafana_admin_password: "{{ vars_grafana_admin_password }}"
      grafana_github_oauth_id: "{{ vars_grafana_github_oauth_id }}"
      grafana_github_oauth_secret: "{{ vars_grafana_github_oauth_secret }}"
      grafana_db_name: grafana
      grafana_db_user: grafana
      grafana_db_password: "{{ vars_postgresql_grafana_password }}"

    - role: letsencrypt
      dummy_certs: "{{ vars_letsencrypt_dummy_certs }}"
      email: admin@rust-lang.org
      domains:
        - "{{ vars_grafana_domain }}"

    # Proxy the Grafana domain to a local HTTP service on port 3000
    # (presumably where Grafana listens - confirm against the role).
    - role: nginx
      proxied:
        - domain: "{{ vars_grafana_domain }}"
          to: http://localhost:3000
You can’t perform that action at this time.