# NOTE: the text originally at the top of this file was GitHub web-page residue
# (permalink/contributors banner, "315 lines (274 sloc) / 12.5 KB" summary and
# the bidirectional-Unicode warning). It is not part of the playbook.
---
# Playbook for the hosts in the `monitoring` inventory group.
#
# Applies, in order, the roles: common, backup, postgresql, monitoring-server,
# letsencrypt and nginx. Every `vars_*` value is supplied from outside this
# file (not visible here).
#
# About the `{{ '{{ ... }}' }}` pattern used in alert annotations and receiver
# templates below: the outer Jinja expression renders a plain string literal,
# so the inner `{{ ... }}` reaches Prometheus / Alertmanager untouched instead
# of being evaluated by Ansible.

- hosts: monitoring
  become: true
  become_user: root

  roles:
    # Parameters for the `common` role.
    - role: common
      papertrail_url: "{{ vars_papertrail_url }}"
      collect_metrics_from:
        - 127.0.0.1
      sudo_users:
        - aidanhs
        - simulacrum
        - pietro

    # Parameters for the `backup` role: daily interval, keeping 7 daily and
    # 4 weekly entries.
    - role: backup
      repository: "{{ vars_backup_repository }}"
      password: "{{ vars_backup_password }}"
      env: "{{ vars_backup_env }}"
      interval: daily
      keep_daily: 7
      keep_weekly: 4

    # PostgreSQL user and database for Grafana; the same password variable is
    # passed again as `grafana_db_password` to the monitoring-server role.
    - role: postgresql
      users:
        grafana:
          password: "{{ vars_postgresql_grafana_password }}"
      databases:
        grafana:
          owner: grafana
          encoding: UTF-8

    # Prometheus scrape jobs, alerting rules, Alertmanager routing/receivers
    # and Grafana settings.
    - role: monitoring-server
      prometheus_scrape_interval: 5s
      prometheus_retention: 30d
      alertmanager_resolve_timeout: 5m

      prometheus_scrape:
        # Metrics about prometheus itself
        - job_name: prometheus
          static_configs:
            - targets: ['{{ inventory_hostname }}:9090']

        # System metrics scraped from node_exporter
        - job_name: node
          static_configs:
            # List of instances to scrape for system metrics
            - targets:
                - "{{ inventory_hostname }}:9100"
                - ci-arm-1.infra.rust-lang.org:9100
                - crater-azure-1.infra.rust-lang.org:9100
                - crater-azure-2.infra.rust-lang.org:9100
                - crater.rust-lang.org:9100
                - docsrs.infra.rust-lang.org:9100
                - bastion.infra.rust-lang.org:9100
                - play-1.infra.rust-lang.org:9100
                - dev-desktop.infra.rust-lang.org:9100

        - job_name: docsrs
          metrics_path: /about/metrics
          scheme: https
          static_configs:
            - targets:
                - docs.rs:443

        - job_name: perfrlo
          metrics_path: /perf/metrics
          scheme: https
          static_configs:
            - targets:
                - perf.rust-lang.org:443

        - job_name: crater
          scheme: https
          static_configs:
            - targets:
                - crater.rust-lang.org:443
          # Crater's metrics endpoint is relatively slow
          scrape_interval: 1m
          scrape_timeout: 30s

        # NOTE(review): current Prometheus documentation describes
        # `authorization.credentials` rather than `bearer_token` -- confirm the
        # deployed Prometheus version before migrating this job and
        # `cratesio_service` below.
        - job_name: monitorbot
          scheme: https
          bearer_token: "{{ vars_prometheus_monitorbot_secret }}"
          static_configs:
            - targets:
                - monitorbot.infra.rust-lang.org:443

        # One job scraping both crates.io environments; the `env` label tells
        # the staging and prod series apart.
        - job_name: cratesio_service
          scheme: https
          bearer_token: "{{ vars_prometheus_cratesio_secret }}"
          metrics_path: /api/private/metrics/service
          static_configs:
            - targets:
                - staging-crates-io.herokuapp.com:443
              labels:
                env: staging
            - targets:
                - crates-io.herokuapp.com:443
              labels:
                env: prod

        - job_name: playground
          metrics_path: /metrics
          scheme: https
          static_configs:
            - targets:
                - play.rust-lang.org:443

        - job_name: cratesio_heroku_metrics
          scheme: https
          basic_auth:
            username: metrics
            password: "{{ vars_prometheus_cratesio_heroku_metrics_secret }}"
          static_configs:
            - targets:
                - crates-io-heroku-metrics.infra.rust-lang.org:443

      prometheus_rule_groups:
        # Alerts on the system metrics collected by the `node` job.
        - name: node
          rules:
            - alert: NodeDown
              expr: up{job="node"} == 0
              for: 2m
              annotations:
                summary: "{{ '{{ $labels.instance }}' }} is down"
                description: "Prometheus wasn't able to reach the instance {{ '{{ $labels.instance }}' }} for more than 2 minutes."

            - alert: SystemdUnitFailed
              expr: node_systemd_unit_state{state="failed",job="node"} == 1
              for: 1m
              annotations:
                summary: "{{ '{{ $labels.name }}' }} on {{ '{{ $labels.instance }}' }} failed"
                description: "The systemd unit {{ '{{ $labels.name }}' }} on the {{ '{{ $labels.instance }}' }} instance failed to execute."

            # tmpfs and fuse.lxcfs filesystems are excluded on both sides of
            # the ratio.
            - alert: DiskFull
              expr: node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="fuse.lxcfs"} < 0.10
              for: 1m
              annotations:
                summary: "Filesystem {{ '{{ $labels.mountpoint }}' }} on {{ '{{ $labels.instance }}' }} is full"
                description: "There is less than 10% disk space left on the mount point {{ '{{ $labels.mountpoint }}' }} (device {{ '{{ $labels.device }}' }}) of the instance {{ '{{ $labels.instance }}' }}."

            - alert: TooMuchRamUsage
              expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.2
              for: 1m
              annotations:
                summary: "High RAM usage on {{ '{{ $labels.instance }}' }}"
                description: "There is less than 20% available RAM on the {{ '{{ $labels.instance }}' }} instance, for more than 1 minute."

        # docs.rs alerts. Each one carries `dispatch: docsrs`, which the
        # Alertmanager route below matches to send it to `discord-docsrs`.
        - name: docsrs
          rules:
            - alert: DocsRsDown
              expr: up{job="docsrs"} == 0
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "docs.rs is down"
                description: "Prometheus wasn't able to reach the docs.rs website for more than a minute."

            # Fires only when something was queued during the last hour and
            # yet no build succeeded in that hour.
            - alert: NoSuccessfulBuilds
              expr: max_over_time(docsrs_queued_crates_count{job="docsrs"}[1h]) > 0 and increase(docsrs_successful_builds{job="docsrs"}[1h]) < 1
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "No successful docs.rs builds in the last hour"
                description: "no documentation builds were successful in the last hour."

            - alert: DocsRsQueueLocked
              expr: docsrs_queue_is_locked{job="docsrs"} == 1
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "docs.rs queue is locked"
                description: "the build queue is marked as locked for more than a minute."

            - alert: LongQueue
              expr: docsrs_prioritized_crates_count{job="docsrs"} >= 50
              for: 12h
              labels:
                dispatch: docsrs
              annotations:
                summary: "The docs.rs build queue is unreasonably long"
                description: "There are more than 50 crates in the docs.rs queue, and the situation didn't resolve itself in the past 12 hours. The build queue is available at https://docs.rs/releases/queue"

            # Deprioritized count = all queued crates minus prioritized ones.
            - alert: LongDeprioritizedQueue
              expr: docsrs_queued_crates_count{job="docsrs"} - docsrs_prioritized_crates_count{job="docsrs"} >= 1000
              for: 24h
              labels:
                dispatch: docsrs
              annotations:
                summary: "The docs.rs deprioritized queue is unreasonably long"
                description: "There are more than 1000 deprioritized crates in the docs.rs queue, and the situation didn't resolve itself in the past 24 hours. The build queue is available at https://docs.rs/releases/queue"

            - alert: FailingBuilds
              expr: increase(docsrs_failed_builds{job="docsrs"}[3h]) / increase(docsrs_total_builds{job="docsrs"}[3h]) > 0.75
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "Most of the docs.rs builds are failing"
                description: "More than 75% of the crate builds failed over the past 3 hours. Recent failures are available at https://docs.rs/releases/recent-failures."

            - alert: BrokenDatabase
              expr: increase(docsrs_failed_db_connections{job="docsrs"}[1m]) > 0
              for: 1m
              labels:
                dispatch: docsrs
              annotations:
                summary: "Database connections are failing on docs.rs"
                description: "In the past minute some database connections failed to be established!"

        - name: crater
          rules:
            # The 5m `increase` window plus the 25m `for` clause add up to the
            # 30 minutes quoted in the description.
            - alert: StuckAgent
              expr: (sum by (agent) (increase(crater_completed_jobs_total{job="crater"}[5m]))) == 0 and (sum by (agent) (crater_agent_supposed_to_work{job="crater"})) == 1
              for: 25m
              annotations:
                summary: "Crater agent {{ '{{ $labels.agent }}' }} is stuck"
                description: "The Crater agent {{ '{{ $labels.agent }}' }} has work to do, but it reported no results for the past 30 minutes."

        # Alerts on the metrics scraped by the `monitorbot` job.
        - name: github
          rules:
            - alert: LowGitHubAPIQuota
              expr: monitorbot_github_rate_limit_remaining{job="monitorbot"} / monitorbot_github_rate_limit_limit{job="monitorbot"} < 0.2
              for: 1m
              annotations:
                summary: "GitHub user {{ '{{ $labels.username }}' }} has low API quota"
                description: "The {{ '{{ $labels.product }}' }} API quota for GitHub user {{ '{{ $labels.username }}' }} is less than 20%."

            - alert: SelfHostedRunnerOffline
              expr: monitorbot_gha_runner_online{job="monitorbot"} != 1
              for: 5m
              annotations:
                summary: "Self-Hosted runner {{ '{{ $labels.runner }}' }} in repo {{ '{{ $labels.repo }}' }} is offline"
                description: "GitHub is reporting that the self-hosted runner {{ '{{ $labels.runner }}' }} in the repo {{ '{{ $labels.repo }}' }} is offline. This could be caused by the machine hosting the runner having problems, or by the runner software being out of date."

      # Default receiver is `zulip-infra`; the child routes below override it.
      # NOTE(review): current Alertmanager documentation prefers `matchers:`
      # over `match` / `match_re` -- confirm the deployed version before
      # migrating.
      alertmanager_route:
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        receiver: zulip-infra
        routes:
          # Alerts explicitly labelled for the docs.rs team.
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              dispatch: docsrs
          # System-level alerts for the docs.rs instance also go to the
          # docs.rs receiver.
          - receiver: discord-docsrs
            group_by: ['alertname']
            match:
              job: node
              instance: docsrs.infra.rust-lang.org:9100
          # Suppress disk full alerts for Crater agents. They're supposed to go
          # near the limit and they're configured to handle things properly,
          # cleaning up unused files when they reach a threshold.
          - receiver: devnull
            match:
              job: node
              alertname: DiskFull
            match_re:
              instance: ^crater-[^\.]+\.infra\.rust-lang\.org:9100$

      alertmanager_receivers:
        - name: zulip-infra
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_zulip_infra }}"
              channel: alertmanager
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired, {{ .Alerts.Resolved | len }} alerts resolved!' }}"
              title_link: "https://grafana.rust-lang.org"
              text: "{{ '{{ range .Alerts }}**{{ if eq .Status \"firing\" }}Fired{{ else }}Resolved{{ end }}: {{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}' }}"
              send_resolved: true

        - name: discord-docsrs
          slack_configs:
            - api_url: "{{ vars_alertmanager_receiver_discord_docsrs }}/slack"
              channel: doesnt-matter
              title: "{{ '{{ .Alerts.Firing | len }} alerts fired, {{ .Alerts.Resolved | len }} alerts resolved!' }}"
              title_link: "https://grafana.rust-lang.org"
              text: "{{ '{{ range .Alerts }}**{{ if eq .Status \"firing\" }}Fired{{ else }}Resolved{{ end }}: {{ .Annotations.summary }}**\n{{ .Annotations.description }}\n{{ end }}' }}"
              send_resolved: true

        # Receiver with no notification configs: alerts routed here are
        # not delivered anywhere.
        - name: devnull

      grafana_github_teams:
        - 2357301  # rust-lang/infra
        - 3552538  # rust-lang/docs-rs
        - 2948097  # rust-lang/crates-io
        - 5083043  # rust-lang/crates-io-on-call
      grafana_domain: "{{ vars_grafana_domain }}"
      grafana_admin_password: "{{ vars_grafana_admin_password }}"
      grafana_github_oauth_id: "{{ vars_grafana_github_oauth_id }}"
      grafana_github_oauth_secret: "{{ vars_grafana_github_oauth_secret }}"
      grafana_db_name: grafana
      grafana_db_user: grafana
      grafana_db_password: "{{ vars_postgresql_grafana_password }}"

    # Certificate for the Grafana domain.
    - role: letsencrypt
      dummy_certs: "{{ vars_letsencrypt_dummy_certs }}"
      email: admin@rust-lang.org
      domains:
        - "{{ vars_grafana_domain }}"

    # Proxies the Grafana domain to a local service on port 3000
    # (presumably Grafana itself -- confirm against the monitoring-server role).
    - role: nginx
      proxied:
        - domain: "{{ vars_grafana_domain }}"
          to: http://localhost:3000