Skip to content

Commit 130ad40

Browse files
committed
fix(infra): gg tls certs timer & precreate tls dir
1 parent 8cad238 commit 130ad40

File tree

4 files changed

+36
-7
lines changed

4 files changed

+36
-7
lines changed

svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/rivet.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,25 @@ pub fn fetch_tls(
2626
initialize_immediately: bool,
2727
server_token: &str,
2828
traefik_instance_name: &str,
29+
datacenter_id: Uuid,
2930
) -> GlobalResult<String> {
3031
let mut script = include_str!("../files/rivet_fetch_tls.sh")
3132
.replace("__NAME__", traefik_instance_name)
3233
.replace("__SERVER_TOKEN__", server_token)
3334
.replace(
3435
"__TUNNEL_API_INTERNAL_PORT__",
3536
&TUNNEL_API_INTERNAL_PORT.to_string(),
36-
);
37+
)
38+
.replace("__DATACENTER_ID__", &datacenter_id.to_string());
3739

3840
if initialize_immediately {
39-
script.push_str("systemctl start rivet_fetch_tls.timer\n");
41+
// Start timer & run script immediately
42+
script.push_str(indoc!(
43+
"
44+
systemctl start rivet_fetch_tls.timer
45+
systemctl start --no-block rivet_fetch_tls.service
46+
"
47+
));
4048
}
4149

4250
Ok(script)

svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/rivet_fetch_tls.sh

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# Create dir to hold TLS certs
2+
#
3+
# The Traefik install script also creates these directories (and chowns them),
4+
# but we need the dirs to exist for the rivet_fetch_tls.sh script to run before
5+
# Traefik is installed when using initialize_immediately.
6+
mkdir -p /etc/__NAME__/dynamic/tls /etc/__NAME__/tls
7+
18
# Write script
29
cat << 'EOF' > /usr/bin/rivet_fetch_tls.sh
310
#!/usr/bin/env bash
@@ -6,12 +13,13 @@ set -eu -o pipefail
613
CERT_ID="job"
714
STUB="/etc/__NAME__/tls/$CERT_ID"
815
16+
917
# Retry the request every 5 seconds until it succeeds
1018
while true; do
1119
response=$(
1220
curl -f \
1321
-H "Authorization: Bearer __SERVER_TOKEN__" \
14-
"http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/___DATACENTER_ID___/tls"
22+
"http://127.0.0.1:__TUNNEL_API_INTERNAL_PORT__/provision/datacenters/__DATACENTER_ID__/tls"
1523
) && break || sleep 5
1624
done
1725
@@ -59,7 +67,12 @@ Requires=network-online.target
5967
After=network-online.target
6068
6169
[Timer]
62-
OnUnitInactiveSec=1h
70+
# Run immediately on startup
71+
OnBootSec=0
72+
# Trigger every hour
73+
OnCalendar=*:0
74+
# Add random jitter to prevent a thundering herd (all servers fetching at once)
75+
RandomizedDelaySec=60
6376
Unit=rivet_fetch_tls.service
6477
6578
[Install]

svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ pub async fn gen_install(
1414
pool_type: backend::cluster::PoolType,
1515
initialize_immediately: bool,
1616
server_token: &str,
17+
datacenter_id: Uuid,
1718
) -> GlobalResult<String> {
1819
// MARK: Common (pre)
1920
let mut script = vec![
@@ -41,6 +42,7 @@ pub async fn gen_install(
4142
initialize_immediately,
4243
server_token,
4344
GG_TRAEFIK_INSTANCE_NAME,
45+
datacenter_id,
4446
)?);
4547
}
4648
backend::cluster::PoolType::Ats => {

svc/pkg/cluster/worker/src/workers/server_install/mod.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ mod install_scripts;
1313

1414
#[worker(name = "cluster-server-install", timeout = 200)]
1515
async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -> GlobalResult<()> {
16+
let datacenter_id = unwrap!(ctx.datacenter_id).as_uuid();
17+
1618
// Check for stale message
1719
if ctx.req_dt() > util::duration::hours(1) {
1820
tracing::warn!("discarding stale message");
@@ -71,8 +73,13 @@ async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -
7173
.await?;
7274
let server_token = &unwrap_ref!(token_res.token).token;
7375

74-
let install_script =
75-
install_scripts::gen_install(pool_type, ctx.initialize_immediately, server_token).await?;
76+
let install_script = install_scripts::gen_install(
77+
pool_type,
78+
ctx.initialize_immediately,
79+
server_token,
80+
datacenter_id,
81+
)
82+
.await?;
7683
let hook_script = install_scripts::gen_hook(server_token).await?;
7784
let initialize_script = install_scripts::gen_initialize(pool_type).await?;
7885

@@ -161,7 +168,6 @@ async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -
161168
.await?;
162169

163170
// Scale to get rid of tainted servers
164-
let datacenter_id = unwrap_ref!(ctx.datacenter_id).as_uuid();
165171
msg!([ctx] @recursive cluster::msg::datacenter_scale(datacenter_id) {
166172
datacenter_id: ctx.datacenter_id,
167173
})

0 commit comments

Comments
 (0)