Skip to content

Commit

Permalink
fix(infra): gg tls certs timer & precreate tls dir (#812)
Browse files Browse the repository at this point in the history
<!-- Please make sure there is an issue that this PR is correlated to. -->

## Changes

<!-- If there are frontend changes, please include screenshots. -->
  • Loading branch information
NathanFlurry committed May 31, 2024
1 parent 4559152 commit b4b707e
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,25 @@ pub fn fetch_tls(
initialize_immediately: bool,
server_token: &str,
traefik_instance_name: &str,
datacenter_id: Uuid,
) -> GlobalResult<String> {
let mut script = include_str!("../files/rivet_fetch_tls.sh")
.replace("__NAME__", traefik_instance_name)
.replace("__SERVER_TOKEN__", server_token)
.replace(
"__TUNNEL_API_INTERNAL_PORT__",
&TUNNEL_API_INTERNAL_PORT.to_string(),
);
)
.replace("__DATACENTER_ID__", &datacenter_id.to_string());

if initialize_immediately {
script.push_str("systemctl start rivet_fetch_tls.timer\n");
// Start timer & run script immediately
script.push_str(indoc!(
"
systemctl start rivet_fetch_tls.timer
systemctl start --no-block rivet_fetch_tls.service
"
));
}

Ok(script)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Create dir to hold TLS certs
#
# The Traefik install script also creates these directories (and chowns them),
# but we need the dirs to exist for the rivet_fetch_tls.sh script to run before
# Traefik is installed when using initialize_immediately.
mkdir -p /etc/__NAME__/dynamic/tls /etc/__NAME__/tls

# Write script
cat << 'EOF' > /usr/bin/rivet_fetch_tls.sh
#!/usr/bin/env bash
Expand All @@ -6,12 +13,13 @@ set -eu -o pipefail
CERT_ID="job"
STUB="/etc/__NAME__/tls/$CERT_ID"
# Retry the request every 5 seconds until it succeeds
while true; do
response=$(
curl -f \
-H "Authorization: Bearer __SERVER_TOKEN__" \
"http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/___DATACENTER_ID___/tls"
"http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/__DATACENTER_ID__/tls"
) && break || sleep 5
done
Expand Down Expand Up @@ -59,7 +67,12 @@ Requires=network-online.target
After=network-online.target
[Timer]
OnUnitInactiveSec=1h
# Run immediately on startup
OnBootSec=0
# Trigger every hour
OnCalendar=*:0
# Add random jitter to prevent a thundering herd
RandomizedDelaySec=60
Unit=rivet_fetch_tls.service
[Install]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub async fn gen_install(
pool_type: backend::cluster::PoolType,
initialize_immediately: bool,
server_token: &str,
datacenter_id: Uuid,
) -> GlobalResult<String> {
// MARK: Common (pre)
let mut script = vec![
Expand Down Expand Up @@ -41,6 +42,7 @@ pub async fn gen_install(
initialize_immediately,
server_token,
GG_TRAEFIK_INSTANCE_NAME,
datacenter_id,
)?);
}
backend::cluster::PoolType::Ats => {
Expand Down
12 changes: 9 additions & 3 deletions svc/pkg/cluster/worker/src/workers/server_install/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ mod install_scripts;

#[worker(name = "cluster-server-install", timeout = 200)]
async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -> GlobalResult<()> {
let datacenter_id = unwrap!(ctx.datacenter_id).as_uuid();

// Check for stale message
if ctx.req_dt() > util::duration::hours(1) {
tracing::warn!("discarding stale message");
Expand Down Expand Up @@ -71,8 +73,13 @@ async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -
.await?;
let server_token = &unwrap_ref!(token_res.token).token;

let install_script =
install_scripts::gen_install(pool_type, ctx.initialize_immediately, server_token).await?;
let install_script = install_scripts::gen_install(
pool_type,
ctx.initialize_immediately,
server_token,
datacenter_id,
)
.await?;
let hook_script = install_scripts::gen_hook(server_token).await?;
let initialize_script = install_scripts::gen_initialize(pool_type).await?;

Expand Down Expand Up @@ -161,7 +168,6 @@ async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -
.await?;

// Scale to get rid of tainted servers
let datacenter_id = unwrap_ref!(ctx.datacenter_id).as_uuid();
msg!([ctx] @recursive cluster::msg::datacenter_scale(datacenter_id) {
datacenter_id: ctx.datacenter_id,
})
Expand Down

0 comments on commit b4b707e

Please sign in to comment.