diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/rivet.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/rivet.rs index dbb93b78f6..f6224703c2 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/rivet.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/rivet.rs @@ -26,6 +26,7 @@ pub fn fetch_tls( initialize_immediately: bool, server_token: &str, traefik_instance_name: &str, + datacenter_id: Uuid, ) -> GlobalResult { let mut script = include_str!("../files/rivet_fetch_tls.sh") .replace("__NAME__", traefik_instance_name) @@ -33,10 +34,17 @@ pub fn fetch_tls( .replace( "__TUNNEL_API_INTERNAL_PORT__", &TUNNEL_API_INTERNAL_PORT.to_string(), - ); + ) + .replace("__DATACENTER_ID__", &datacenter_id.to_string()); if initialize_immediately { - script.push_str("systemctl start rivet_fetch_tls.timer\n"); + // Start timer & run script immediately + script.push_str(indoc!( + " + systemctl start rivet_fetch_tls.timer + systemctl start --no-block rivet_fetch_tls.service + " + )); } Ok(script) diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/rivet_fetch_tls.sh b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/rivet_fetch_tls.sh index 04777bfddb..13f9d1d0c4 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/rivet_fetch_tls.sh +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/rivet_fetch_tls.sh @@ -1,3 +1,10 @@ +# Create dir to hold TLS certs +# +# The Traefik install script also creates these directories (and chown them), +# but we need the dirs to exist for the rivet_fetch_tls.sh script to run before +# Traefik is installed when using initialize_immediately. +mkdir -p /etc/__NAME__/dynamic/tls /etc/__NAME__/tls + # Write script cat << 'EOF' > /usr/bin/rivet_fetch_tls.sh #!/usr/bin/env bash @@ -6,12 +13,13 @@ set -eu -o pipefail CERT_ID="job" STUB="/etc/__NAME__/tls/$CERT_ID" + # Retry script every 5 seconds while true; do response=$( curl -f \ -H "Authorization: Bearer __SERVER_TOKEN__" \ - "http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/___DATACENTER_ID___/tls" + "http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/__DATACENTER_ID__/tls" ) && break || sleep 5 done @@ -59,7 +67,12 @@ Requires=network-online.target After=network-online.target [Timer] -OnUnitInactiveSec=1h +# Run immediately on startup +OnBootSec=0 +# Trigger every hour +OnCalendar=*:0 +# Prevent stampeding herd +RandomizedDelaySec=60 Unit=rivet_fetch_tls.service [Install] diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs index 9b5ad6a5a7..c09e1e40b5 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs @@ -14,6 +14,7 @@ pub async fn gen_install( pool_type: backend::cluster::PoolType, initialize_immediately: bool, server_token: &str, + datacenter_id: Uuid, ) -> GlobalResult { // MARK: Common (pre) let mut script = vec![ @@ -41,6 +42,7 @@ pub async fn gen_install( initialize_immediately, server_token, GG_TRAEFIK_INSTANCE_NAME, + datacenter_id, )?); } backend::cluster::PoolType::Ats => { diff --git a/svc/pkg/cluster/worker/src/workers/server_install/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/mod.rs index 9e7591bfa2..57ad012d4f 100644 --- a/svc/pkg/cluster/worker/src/workers/server_install/mod.rs +++ b/svc/pkg/cluster/worker/src/workers/server_install/mod.rs @@ -13,6 +13,8 @@ mod install_scripts; #[worker(name = "cluster-server-install", timeout = 200)] async fn worker(ctx: &OperationContext) -> GlobalResult<()> { + let datacenter_id = unwrap!(ctx.datacenter_id).as_uuid(); + // Check for stale message if ctx.req_dt() > util::duration::hours(1) { tracing::warn!("discarding stale message"); @@ -71,8 +73,13 @@ async fn worker(ctx: &OperationContext) - .await?; let server_token = &unwrap_ref!(token_res.token).token; - let install_script = - install_scripts::gen_install(pool_type, ctx.initialize_immediately, server_token).await?; + let install_script = install_scripts::gen_install( + pool_type, + ctx.initialize_immediately, + server_token, + datacenter_id, + ) + .await?; let hook_script = install_scripts::gen_hook(server_token).await?; let initialize_script = install_scripts::gen_initialize(pool_type).await?; @@ -161,7 +168,6 @@ async fn worker(ctx: &OperationContext) - .await?; // Scale to get rid of tainted servers - let datacenter_id = unwrap_ref!(ctx.datacenter_id).as_uuid(); msg!([ctx] @recursive cluster::msg::datacenter_scale(datacenter_id) { datacenter_id: ctx.datacenter_id, })