# NOTE: navigation text from the code-hosting web page was removed here; it was not part of the playbook (captured from the "master" branch).
---

# Playbook for the `monitoring` host group. It applies, as root, the roles
# common, backup, postgresql, monitoring-server, letsencrypt and nginx.
# Every `vars_*` value referenced below is defined outside this file.
- hosts: monitoring
  become: true
  become_user: root

  roles:
    - role: common
      papertrail_url: "{{ vars_papertrail_url }}"
      collect_metrics_from:
        - 127.0.0.1
      sudo_users:
        - aidanhs
        - simulacrum
        - pietro

    - role: backup
      repository: "{{ vars_backup_repository }}"
      password: "{{ vars_backup_password }}"
      env: "{{ vars_backup_env }}"
      interval: daily
      keep_daily: 7
      keep_weekly: 4

    # The `grafana` user/database declared here use the same names and
    # password variable as the grafana_db_* settings of the
    # monitoring-server role below.
    - role: postgresql
      users:
        grafana:
          password: "{{ vars_postgresql_grafana_password }}"
      databases:
        grafana:
          owner: grafana
          encoding: UTF-8

    - role: monitoring-server
      prometheus_scrape_interval: 5s
      prometheus_retention: 30d
      alertmanager_resolve_timeout: 5m

      # Scrape jobs. Jobs without an explicit `metrics_path` rely on the
      # default path of whatever consumes this list (presumably Prometheus'
      # own default -- confirm in the role).
      prometheus_scrape:
        # Metrics about prometheus itself
        - job_name: prometheus
          static_configs:
            - targets: ['{{ inventory_hostname }}:9090']

        # System metrics scraped from node_exporter
        - job_name: node
          static_configs:
            # List of instances to scrape for system metrics
            - targets:
                - "{{ inventory_hostname }}:9100"
                - ci-arm-1.infra.rust-lang.org:9100
                - crater-azure-1.infra.rust-lang.org:9100
                - crater-azure-2.infra.rust-lang.org:9100
                - crater.rust-lang.org:9100
                - docsrs.infra.rust-lang.org:9100
                - bastion.infra.rust-lang.org:9100
                - play-1.infra.rust-lang.org:9100
                - dev-desktop.infra.rust-lang.org:9100

        - job_name: docsrs
          metrics_path: /about/metrics
          scheme: https
          static_configs:
            - targets:
                - docs.rs:443

        - job_name: perfrlo
          metrics_path: /perf/metrics
          scheme: https
          static_configs:
            - targets:
                - perf.rust-lang.org:443

        - job_name: crater
          scheme: https
          static_configs:
            - targets:
                - crater.rust-lang.org:443
          # Crater's metrics endpoint is relatively slow
          scrape_interval: 1m
          scrape_timeout: 30s

        - job_name: monitorbot
          scheme: https
          bearer_token: "{{ vars_prometheus_monitorbot_secret }}"
          static_configs:
            - targets:
                - monitorbot.infra.rust-lang.org:443

        # Staging and production are scraped by the same job and told apart
        # by the `env` label attached to each target group.
        - job_name: cratesio_service
          scheme: https
          bearer_token: "{{ vars_prometheus_cratesio_secret }}"
          metrics_path: /api/private/metrics/service
          static_configs:
            - targets:
                - staging-crates-io.herokuapp.com:443
              labels:
                env: staging
            - targets:
                - crates-io.herokuapp.com:443
              labels:
                env: prod

        - job_name: playground
          metrics_path: /metrics
          scheme: https
          static_configs:
            - targets:
                - play.rust-lang.org:443

        - job_name: cratesio_heroku_metrics
          scheme: https
          basic_auth:
            username: metrics
            password: "{{ vars_prometheus_cratesio_heroku_metrics_secret }}"
          static_configs:
            - targets:
                - crates-io-heroku-metrics.infra.rust-lang.org:443

      # Alerting rules. The `{{ '{{ ... }}' }}` wrapping makes Jinja emit a
      # literal `{{ ... }}`, so the inner template is left intact for the
      # tool that evaluates the alert annotations.
      prometheus_rule_groups:
        - name: node
          rules:
            - alert: NodeDown
              expr: up{job="node"} == 0
              for: 2m
              annotations:
                summary: "{{ '{{ $labels.instance }}' }} is down"
                description: "Prometheus wasn't able to reach the instance {{ '{{ $labels.instance }}' }} for more than 2 minutes."

            - alert: SystemdUnitFailed
              expr: node_systemd_unit_state{state="failed",job="node"} == 1
              for: 1m
              annotations:
                summary: "{{ '{{ $labels.name }}' }} on {{ '{{ $labels.instance }}' }} failed"
                description: "The systemd unit {{ '{{ $labels.name }}' }} on the {{ '{{ $labels.instance }}' }} instance failed to execute."

            # tmpfs and fuse.lxcfs filesystems are excluded on both sides of
            # the ratio.
            - alert: DiskFull
              expr: node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} < 0.10
              for: 1m
              annotations:
                summary: "Filesystem {{ '{{ $labels.mountpoint }}' }} on {{ '{{ $labels.instance }}' }} is full"
                description: "There is less than 10% disk space left on the mount point {{ '{{ $labels.mountpoint }}' }} (device {{ '{{ $labels.device }}' }}) of the instance {{ '{{ $labels.instance }}' }}."

            - alert: TooMuchRamUsage
              expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.2
              for: 1m
              annotations:
                summary: "High RAM usage on {{ '{{ $labels.instance }}' }}"
                description: "There is less than 20% available RAM on the {{ '{{ $labels.instance }}' }} instance, for more than 1 minute."

        # Every rule in this group carries `dispatch: docsrs`, which the
        # first entry of alertmanager_route.routes matches on.
        - name: docsrs
          rules:
            - alert: DocsRsDown
              expr: up{job="docsrs"} == 0
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "docs.rs is down"
                description: "Prometheus wasn't able to reach the docs.rs website for more than a minute."

            - alert: NoSuccessfulBuilds
              expr: max_over_time(docsrs_queued_crates_count{job="docsrs"}[1h]) > 0 and increase(docsrs_successful_builds{job="docsrs"}[1h]) < 1
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "No successful docs.rs builds in the last hour"
                description: "no documentation builds were successful in the last hour."

            - alert: DocsRsQueueLocked
              expr: docsrs_queue_is_locked{job="docsrs"} == 1
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "docs.rs queue is locked"
                description: "the build queue is marked as locked for more than a minute."

            - alert: LongQueue
              expr: docsrs_prioritized_crates_count{job="docsrs"} >= 50
              for: 12h
              labels:
                dispatch: docsrs
              annotations:
                summary: "The docs.rs build queue is unreasonably long"
                description: "There are more than 50 crates in the docs.rs queue, and the situation didn't resolve itself in the past 12 hours. The build queue is available at https://docs.rs/releases/queue"

            # Deprioritized count = total queued minus prioritized.
            - alert: LongDeprioritizedQueue
              expr: docsrs_queued_crates_count{job="docsrs"} - docsrs_prioritized_crates_count{job="docsrs"} >= 1000
              for: 24h
              labels:
                dispatch: docsrs
              annotations:
                summary: "The docs.rs deprioritized queue is unreasonably long"
                description: "There are more than 1000 deprioritized crates in the docs.rs queue, and the situation didn't resolve itself in the 24 hours. The build queue is available at https://docs.rs/releases/queue"

            - alert: FailingBuilds
              expr: increase(docsrs_failed_builds{job="docsrs"}[3h]) / increase(docsrs_total_builds{job="docsrs"}[3h]) > 0.75
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "Most of the docs.rs builds are failing"
                description: "More than 75% of the crate builds failed over the past 3 hours. Recent failures are available at https://docs.rs/releases/recent-failures."

            - alert: BrokenDatabase
              expr: increase(docsrs_failed_db_connections{job="docsrs"}[1m]) > 0
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "Database connections are failing on docs.rs"
                description: "In the past minute some database connections failed to be established!"

        - name: crater
          rules:
            # The 5m increase() window plus the 25m `for` adds up to the
            # 30 minutes quoted in the description.
            - alert: StuckAgent
              expr: (sum by (agent) (increase(crater_completed_jobs_total{job="crater"}[5m]))) == 0 and (sum by (agent) (crater_agent_supposed_to_work{job="crater"})) == 1
              for: 25m
              annotations:
                summary: "Crater agent {{ '{{ $labels.agent }}' }} is stuck"
                description: "The Crater agent {{ '{{ $labels.agent }}' }} has work to do, but it reported no results for the past 30 minutes."

        - name: github
          rules:
            - alert: LowGitHubAPIQuota
              expr: monitorbot_github_rate_limit_remaining{job="monitorbot"} / monitorbot_github_rate_limit_limit{job="monitorbot"} < 0.2
              for: 1m
              annotations:
                summary: "GitHub user {{ '{{ $labels.username }}' }} has low API quota"
                description: "The {{ '{{ $labels.product }}' }} API quota for GitHub user {{ '{{ $labels.username }}' }} is less than 20%."

            - alert: SelfHostedRunnerOffline
              expr: monitorbot_gha_runner_online{job="monitorbot"} != 1
              for: 5m
              annotations:
                summary: "Self-Hosted runner {{ '{{ $labels.runner }}' }} in repo {{ '{{ $labels.repo }}' }} is offline"
                description: "GitHub is reporting that the self-hosted runner {{ '{{ $labels.runner }}' }} in the repo {{ '{{ $labels.repo }}' }} is offline. This could be caused by the machine hosting the runner having problems, or by the runner software being out of date."

      # Alert routing: zulip-infra is the default receiver; the child routes
      # below override it for specific label matches.
      alertmanager_route:
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        receiver: zulip-infra
        routes:
          # Alerts explicitly labelled for the docs.rs team.
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              dispatch: docsrs
          # System-level alerts for the docs.rs server.
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              job: node
              instance: docsrs.infra.rust-lang.org:9100
          # Suppress disk full alerts for Crater agents. They're supposed to go
          # near the limit and they're configured to handle things properly,
          # cleaning up unused files when they reach a threshold.
          - receiver: devnull
            match:
              job: node
              alertname: DiskFull
            match_re:
              instance: ^crater-[^\.]+\.infra\.rust-lang\.org:9100$

      # Both chat receivers are configured through `slack_configs`
      # (presumably Slack-compatible webhook endpoints -- confirm).
      alertmanager_receivers:
        - name: zulip-infra
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_zulip_infra }}"
              channel: alertmanager
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired, {{ .Alerts.Resolved | len }} alerts resolved!' }}"
              title_link: "https://grafana.rust-lang.org"
              text: "{{ '{{ range .Alerts }}**{{ if eq .Status \"firing\" }}Fired{{ else }}Resolved{{ end }}: {{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}' }}"
              send_resolved: true

        - name: discord-docsrs
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_discord_docsrs }}/slack"
              channel: doesnt-matter
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired, {{ .Alerts.Resolved | len }} alerts resolved!' }}"
              title_link: "https://grafana.rust-lang.org"
              text: "{{ '{{ range .Alerts }}**{{ if eq .Status \"firing\" }}Fired{{ else }}Resolved{{ end }}: {{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}' }}"
              send_resolved: true

        # Receiver with no notification config; used by the route above
        # that suppresses Crater DiskFull alerts.
        - name: devnull

      # Numeric GitHub team IDs; the team each ID refers to is noted inline.
      grafana_github_teams:
        - 2357301  # rust-lang/infra
        - 3552538  # rust-lang/docs-rs
        - 2948097  # rust-lang/crates-io
        - 5083043  # rust-lang/crates-io-on-call

      grafana_domain: "{{ vars_grafana_domain }}"
      grafana_admin_password: "{{ vars_grafana_admin_password }}"
      grafana_github_oauth_id: "{{ vars_grafana_github_oauth_id }}"
      grafana_github_oauth_secret: "{{ vars_grafana_github_oauth_secret }}"

      # Must stay in sync with the user/database declared in the postgresql
      # role above.
      grafana_db_name: grafana
      grafana_db_user: grafana
      grafana_db_password: "{{ vars_postgresql_grafana_password }}"

    - role: letsencrypt
      dummy_certs: "{{ vars_letsencrypt_dummy_certs }}"
      email: admin@rust-lang.org
      domains:
        - "{{ vars_grafana_domain }}"

    # Proxies the Grafana domain to a service on local port 3000
    # (presumably Grafana itself -- confirm in the monitoring-server role).
    - role: nginx
      proxied:
        - domain: "{{ vars_grafana_domain }}"
          to: http://localhost:3000