From 152c55b28a2a41fb57115bc706e85b91bf09642e Mon Sep 17 00:00:00 2001 From: MasterPtato <23087326+MasterPtato@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:47:27 +0000 Subject: [PATCH] feat(clusters): gg monitor for better uptime (#921) Fixes RVTEE-77 ## Changes --- infra/tf/better_uptime/main.tf | 2 ++ infra/tf/better_uptime/vars.tf | 1 + .../cluster-provisioning.json | 2 +- lib/bolt/core/src/dep/terraform/gen.rs | 23 +++++++++++-- svc/api/cloud/src/route/devices/links.rs | 2 +- .../install_scripts/components/mod.rs | 1 + .../install_scripts/components/ok_server.rs | 13 ++++++++ .../install_scripts/components/traefik.rs | 29 ++++++++++++++++ .../install_scripts/files/ok_server.sh | 33 +++++++++++++++++++ .../server_install/install_scripts/mod.rs | 8 +++-- .../worker/src/workers/server_install/mod.rs | 2 +- 11 files changed, 109 insertions(+), 7 deletions(-) create mode 100644 svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/ok_server.rs create mode 100644 svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/ok_server.sh diff --git a/infra/tf/better_uptime/main.tf b/infra/tf/better_uptime/main.tf index 27fb242e29..4715b25bc7 100644 --- a/infra/tf/better_uptime/main.tf +++ b/infra/tf/better_uptime/main.tf @@ -64,6 +64,8 @@ resource "betteruptime_monitor" "monitor" { ] email = var.better_uptime_notify push = var.better_uptime_notify + verify_ssl = try(each.value.monitor.verify_ssl, false) + ssl_expiration = try(each.value.monitor.verify_ssl, false) ? 
14 : null } resource "betteruptime_status_page_resource" "status_page_resource" { diff --git a/infra/tf/better_uptime/vars.tf b/infra/tf/better_uptime/vars.tf index db0c2d8e99..91e5f662d5 100644 --- a/infra/tf/better_uptime/vars.tf +++ b/infra/tf/better_uptime/vars.tf @@ -7,6 +7,7 @@ variable "better_uptime_groups" { id = string url = string public_name = string + verify_ssl = optional(bool) })) })) } diff --git a/infra/tf/grafana/grafana_dashboards/cluster-provisioning.json b/infra/tf/grafana/grafana_dashboards/cluster-provisioning.json index 127eafd19e..3e50120fc5 100644 --- a/infra/tf/grafana/grafana_dashboards/cluster-provisioning.json +++ b/infra/tf/grafana/grafana_dashboards/cluster-provisioning.json @@ -946,7 +946,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (pool_type, datacenter_id) (provision_draining_tainted_servers{cluster_id=~\"[[cluster_id]]\", datacenter_id=~\"[[datacenter_id]]\", provider_datacenter_id=~\"[[provider_datacenter_id]]\", pool_type=~\"[[pool_type]]\"})", + "expr": "sum by (pool_type, datacenter_id) (rivet_provision_draining_tainted_servers{cluster_id=~\"[[cluster_id]]\", datacenter_id=~\"[[datacenter_id]]\", provider_datacenter_id=~\"[[provider_datacenter_id]]\", pool_type=~\"[[pool_type]]\"})", "instant": false, "legendFormat": "{{pool_type}} ({{datacenter_id}})", "range": true, diff --git a/lib/bolt/core/src/dep/terraform/gen.rs b/lib/bolt/core/src/dep/terraform/gen.rs index c3d6c5b496..22631662ea 100644 --- a/lib/bolt/core/src/dep/terraform/gen.rs +++ b/lib/bolt/core/src/dep/terraform/gen.rs @@ -544,7 +544,7 @@ async fn vars(ctx: &ProjectContext) { }; // Create monitors - let mm_monitors = cluster + let api_status_monitors = cluster .datacenters .iter() .map(|(name_id, dc)| { @@ -555,6 +555,22 @@ async fn vars(ctx: &ProjectContext) { }) }) .collect::<Vec<_>>(); + let gg_monitors = if let Some(domain_job) = ctx.domain_job() { + cluster + .datacenters + .values() + .map(|dc| { + json!({ + "id": format!("{}-gg", 
dc.datacenter_id), + "url": format!("https://lobby.{}.{domain_job}/status", dc.datacenter_id), + "public_name": format!("{} (GG)", dc.display_name), + "verify_ssl": true, + }) + }) + .collect::<Vec<_>>() + } else { + Vec::new() + }; vars.insert( "better_uptime_groups".into(), @@ -562,7 +578,10 @@ async fn vars(ctx: &ProjectContext) { { "id": "mm", "name": "Matchmaker", - "monitors": mm_monitors, + "monitors": api_status_monitors + .into_iter() + .chain(gg_monitors) + .collect::<Vec<_>>(), }, { "id": "cdn", diff --git a/svc/api/cloud/src/route/devices/links.rs b/svc/api/cloud/src/route/devices/links.rs index a13051061a..995e9ec920 100644 --- a/svc/api/cloud/src/route/devices/links.rs +++ b/svc/api/cloud/src/route/devices/links.rs @@ -81,7 +81,7 @@ pub async fn complete( ) -> GlobalResult { // Verify completer is a user. Cloud tokens should not be able to link other // cloud tokens. - let rivet_claims::ent::User { .. } = ctx.auth().claims()?.as_user()?; + ctx.auth().claims()?.as_user()?; // Verify has access to game ctx.auth() diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/mod.rs index e5604d3065..606eaf7d22 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/mod.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/mod.rs @@ -1,6 +1,7 @@ use indoc::indoc; pub mod nomad; +pub mod ok_server; pub mod rivet; pub mod s3; pub mod traefik; diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/ok_server.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/ok_server.rs new file mode 100644 index 0000000000..00e62f2e7d --- /dev/null +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/ok_server.rs @@ -0,0 +1,13 @@ +pub const OK_SERVER_PORT: usize = 9999; + +pub fn install(initialize_immediately: bool) 
-> String { + let mut script = include_str!("../files/ok_server.sh") + .replace("__OK_SERVER_PORT__", &OK_SERVER_PORT.to_string()); + + if initialize_immediately { + // Run script immediately + script.push_str("systemctl start --no-block ok_server.service"); + } + + script +} diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/traefik.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/traefik.rs index 63c170597e..ea48c099a9 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/traefik.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/traefik.rs @@ -4,6 +4,7 @@ use chirp_worker::prelude::*; use indoc::formatdoc; use super::{ + ok_server::OK_SERVER_PORT, vector::{TUNNEL_VECTOR_PORT, TUNNEL_VECTOR_TCP_JSON_PORT}, TUNNEL_API_INTERNAL_PORT, }; @@ -282,3 +283,31 @@ pub async fn gg_static_config() -> GlobalResult { Ok(config) } + +pub fn gg_dynamic_config(datacenter_id: Uuid) -> GlobalResult<String> { + let domain_job = unwrap!(util::env::domain_job(), "dns not enabled"); + + let main = format!("{datacenter_id}.{domain_job}"); + + Ok(formatdoc!( + r#" + # Always returns 200 at /status + [http.routers.ok-status] + entryPoints = ["lb-80"] + rule = "Host(`lobby.{main}`) && Path(`/status`)" + service = "ok-service" + + [http.routers.ok-status-secure] + entryPoints = ["lb-443"] + rule = "Host(`lobby.{main}`) && Path(`/status`)" + service = "ok-service" + [[http.routers.ok-status-secure.tls.domains]] + main = "{main}" + sans = [] + + [http.services.ok-service.loadBalancer] + [[http.services.ok-service.loadBalancer.servers]] + url = "http://127.0.0.1:{OK_SERVER_PORT}" + "# + )) +} diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/ok_server.sh b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/ok_server.sh new file mode 100644 index 0000000000..486a9d3a0b --- /dev/null +++ 
b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/ok_server.sh @@ -0,0 +1,33 @@ +# Write script +cat << 'EOF' > /usr/bin/ok_server.sh +#!/bin/bash +set -e + +trap "exit" INT +while true; do + { echo -e 'HTTP/1.1 200 OK\r\n\r\n'; } | nc -l -p __OK_SERVER_PORT__ -q 0; +done +EOF + +chmod +x /usr/bin/ok_server.sh + +# Create systemd service file +cat << 'EOF' > /etc/systemd/system/ok_server.service +[Unit] +Description=Rivet Ok Server +Requires=network-online.target +After=network-online.target + +[Service] +User=root +Group=root +Type=oneshot +ExecStart=/usr/bin/ok_server.sh +Type=simple + +[Install] +WantedBy=multi-user.target +EOF + +systemctl daemon-reload +systemctl enable ok_server.service diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs index 130759b0ea..57e8c5859d 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs @@ -44,6 +44,7 @@ pub async fn gen_install( GG_TRAEFIK_INSTANCE_NAME, datacenter_id, )?); + script.push(components::ok_server::install(initialize_immediately)); } backend::cluster::PoolType::Ats => { script.push(components::docker::install()); @@ -71,7 +72,10 @@ pub async fn gen_hook(server_token: &str) -> GlobalResult { // This script is templated on the server itself after fetching server data from the Rivet API (see gen_hook). // After being templated, it is run. 
-pub async fn gen_initialize(pool_type: backend::cluster::PoolType) -> GlobalResult { +pub async fn gen_initialize( + pool_type: backend::cluster::PoolType, + datacenter_id: Uuid, +) -> GlobalResult { let mut script = Vec::new(); let mut prometheus_targets = HashMap::new(); @@ -103,7 +107,7 @@ pub async fn gen_initialize(pool_type: backend::cluster::PoolType) -> GlobalResu components::traefik::Instance { name: GG_TRAEFIK_INSTANCE_NAME.to_string(), static_config: components::traefik::gg_static_config().await?, - dynamic_config: String::new(), + dynamic_config: components::traefik::gg_dynamic_config(datacenter_id)?, tcp_server_transports: Default::default(), }, )); diff --git a/svc/pkg/cluster/worker/src/workers/server_install/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/mod.rs index a0bbf9400d..4f47014552 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/mod.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/mod.rs @@ -81,7 +81,7 @@ async fn worker(ctx: &OperationContext) - ) .await?; let hook_script = install_scripts::gen_hook(server_token).await?; - let initialize_script = install_scripts::gen_initialize(pool_type).await?; + let initialize_script = install_scripts::gen_initialize(pool_type, datacenter_id).await?; // Spawn blocking thread for ssh (no async support) tokio::task::spawn_blocking(move || {