Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(clusters): gg monitor for better uptime #921

Merged
merged 1 commit into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions infra/tf/better_uptime/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ resource "betteruptime_monitor" "monitor" {
]
email = var.better_uptime_notify
push = var.better_uptime_notify
verify_ssl = try(each.value.monitor.verify_ssl, false)
ssl_expiration = try(each.value.monitor.verify_ssl, false) ? 14 : null
}

resource "betteruptime_status_page_resource" "status_page_resource" {
Expand Down
1 change: 1 addition & 0 deletions infra/tf/better_uptime/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ variable "better_uptime_groups" {
id = string
url = string
public_name = string
verify_ssl = optional(bool)
}))
}))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum by (pool_type, datacenter_id) (provision_draining_tainted_servers{cluster_id=~\"[[cluster_id]]\", datacenter_id=~\"[[datacenter_id]]\", provider_datacenter_id=~\"[[provider_datacenter_id]]\", pool_type=~\"[[pool_type]]\"})",
"expr": "sum by (pool_type, datacenter_id) (rivet_provision_draining_tainted_servers{cluster_id=~\"[[cluster_id]]\", datacenter_id=~\"[[datacenter_id]]\", provider_datacenter_id=~\"[[provider_datacenter_id]]\", pool_type=~\"[[pool_type]]\"})",
"instant": false,
"legendFormat": "{{pool_type}} ({{datacenter_id}})",
"range": true,
Expand Down
23 changes: 21 additions & 2 deletions lib/bolt/core/src/dep/terraform/gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ async fn vars(ctx: &ProjectContext) {
};

// Create monitors
let mm_monitors = cluster
let api_status_monitors = cluster
.datacenters
.iter()
.map(|(name_id, dc)| {
Expand All @@ -555,14 +555,33 @@ async fn vars(ctx: &ProjectContext) {
})
})
.collect::<Vec<_>>();
let gg_monitors = if let Some(domain_job) = ctx.domain_job() {
cluster
.datacenters
.values()
.map(|dc| {
json!({
"id": format!("{}-gg", dc.datacenter_id),
"url": format!("https://lobby.{}.{domain_job}/status", dc.datacenter_id),
"public_name": format!("{} (GG)", dc.display_name),
"verify_ssl": true,
})
})
.collect::<Vec<_>>()
} else {
Vec::new()
};

vars.insert(
"better_uptime_groups".into(),
json!([
{
"id": "mm",
"name": "Matchmaker",
"monitors": mm_monitors,
"monitors": api_status_monitors
.into_iter()
.chain(gg_monitors)
.collect::<Vec<_>>(),
},
{
"id": "cdn",
Expand Down
2 changes: 1 addition & 1 deletion svc/api/cloud/src/route/devices/links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ pub async fn complete(
) -> GlobalResult<serde_json::Value> {
// Verify completer is a user. Cloud tokens should not be able to link other
// cloud tokens.
let rivet_claims::ent::User { .. } = ctx.auth().claims()?.as_user()?;
ctx.auth().claims()?.as_user()?;

// Verify has access to game
ctx.auth()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use indoc::indoc;

pub mod nomad;
pub mod ok_server;
pub mod rivet;
pub mod s3;
pub mod traefik;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/// Port number substituted for the `__OK_SERVER_PORT__` placeholder in the ok server install
/// script.
pub const OK_SERVER_PORT: usize = 9999;

/// Builds the shell snippet that installs the ok server script and its systemd unit.
///
/// The template at `../files/ok_server.sh` is embedded at compile time and every
/// `__OK_SERVER_PORT__` placeholder in it is replaced with [`OK_SERVER_PORT`].
///
/// When `initialize_immediately` is `true`, a non-blocking `systemctl start` for
/// `ok_server.service` is appended to the end of the snippet.
pub fn install(initialize_immediately: bool) -> String {
    let mut script = include_str!("../files/ok_server.sh")
        .replace("__OK_SERVER_PORT__", &OK_SERVER_PORT.to_string());

    if initialize_immediately {
        // Guard against a template that lacks a trailing newline; without this the start
        // command would be glued onto the template's final line and break both commands.
        if !script.ends_with('\n') {
            script.push('\n');
        }

        // Run script immediately
        script.push_str("systemctl start --no-block ok_server.service");
    }

    script
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use chirp_worker::prelude::*;
use indoc::formatdoc;

use super::{
ok_server::OK_SERVER_PORT,
vector::{TUNNEL_VECTOR_PORT, TUNNEL_VECTOR_TCP_JSON_PORT},
TUNNEL_API_INTERNAL_PORT,
};
Expand Down Expand Up @@ -282,3 +283,31 @@ pub async fn gg_static_config() -> GlobalResult<String> {

Ok(config)
}

/// Renders the dynamic Traefik configuration for the game guard instance of a datacenter.
///
/// The generated TOML declares two routers (entry points `lb-80` and `lb-443`) that match
/// `Host(lobby.{datacenter_id}.{domain_job})` with path `/status`, and forwards both to a
/// single service pointing at `http://127.0.0.1:{OK_SERVER_PORT}`.
///
/// # Errors
///
/// Propagates the failure raised by `unwrap!` (message `"dns not enabled"`) when
/// `util::env::domain_job()` does not provide a job domain.
pub fn gg_dynamic_config(datacenter_id: Uuid) -> GlobalResult<String> {
    let domain_job = unwrap!(util::env::domain_job(), "dns not enabled");

    // Base domain shared by the router host rules and the TLS domain entry.
    let main = format!("{datacenter_id}.{domain_job}");

    // The template lines share one uniform indent so `formatdoc!` strips it completely.
    Ok(formatdoc!(
        r#"
        # Always returns 200 at /status
        [http.routers.ok-status]
        entryPoints = ["lb-80"]
        rule = "Host(`lobby.{main}`) && Path(`/status`)"
        service = "ok-service"

        [http.routers.ok-status-secure]
        entryPoints = ["lb-443"]
        rule = "Host(`lobby.{main}`) && Path(`/status`)"
        service = "ok-service"
        [[http.routers.ok-status-secure.tls.domains]]
        main = "{main}"
        sans = []

        [http.services.ok-service.loadBalancer]
        [[http.services.ok-service.loadBalancer.servers]]
        url = "http://127.0.0.1:{OK_SERVER_PORT}"
        "#
    ))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Installs a minimal HTTP responder and registers it as a systemd service.
# `__OK_SERVER_PORT__` is a placeholder that must be substituted with the real port
# before this script is executed.

# Write script
# The heredoc delimiter is quoted so nothing inside is expanded at install time.
cat << 'EOF' > /usr/bin/ok_server.sh
#!/bin/bash
set -e

trap "exit" INT
# Each loop iteration runs one `nc` listener that is fed a bare 200 response.
while true; do
    { echo -e 'HTTP/1.1 200 OK\r\n\r\n'; } | nc -l -p __OK_SERVER_PORT__ -q 0;
done
EOF

chmod +x /usr/bin/ok_server.sh

# Create systemd service file
# Only one `Type=` is declared: the responder loops forever, so `simple` is the intended
# type. A second, earlier `Type=oneshot` line was overridden by systemd anyway and has
# been dropped to avoid confusion.
cat << 'EOF' > /etc/systemd/system/ok_server.service
[Unit]
Description=Rivet Ok Server
Requires=network-online.target
After=network-online.target

[Service]
User=root
Group=root
ExecStart=/usr/bin/ok_server.sh
Type=simple

[Install]
WantedBy=multi-user.target
EOF

# Pick up the new unit and enable it at boot; starting it is left to the caller.
systemctl daemon-reload
systemctl enable ok_server.service
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pub async fn gen_install(
GG_TRAEFIK_INSTANCE_NAME,
datacenter_id,
)?);
script.push(components::ok_server::install(initialize_immediately));
}
backend::cluster::PoolType::Ats => {
script.push(components::docker::install());
Expand Down Expand Up @@ -71,7 +72,10 @@ pub async fn gen_hook(server_token: &str) -> GlobalResult<String> {

// This script is templated on the server itself after fetching server data from the Rivet API (see gen_hook).
// After being templated, it is run.
pub async fn gen_initialize(pool_type: backend::cluster::PoolType) -> GlobalResult<String> {
pub async fn gen_initialize(
pool_type: backend::cluster::PoolType,
datacenter_id: Uuid,
) -> GlobalResult<String> {
let mut script = Vec::new();

let mut prometheus_targets = HashMap::new();
Expand Down Expand Up @@ -103,7 +107,7 @@ pub async fn gen_initialize(pool_type: backend::cluster::PoolType) -> GlobalResu
components::traefik::Instance {
name: GG_TRAEFIK_INSTANCE_NAME.to_string(),
static_config: components::traefik::gg_static_config().await?,
dynamic_config: String::new(),
dynamic_config: components::traefik::gg_dynamic_config(datacenter_id)?,
tcp_server_transports: Default::default(),
},
));
Expand Down
2 changes: 1 addition & 1 deletion svc/pkg/cluster/worker/src/workers/server_install/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ async fn worker(ctx: &OperationContext<cluster::msg::server_install::Message>) -
)
.await?;
let hook_script = install_scripts::gen_hook(server_token).await?;
let initialize_script = install_scripts::gen_initialize(pool_type).await?;
let initialize_script = install_scripts::gen_initialize(pool_type, datacenter_id).await?;

// Spawn blocking thread for ssh (no async support)
tokio::task::spawn_blocking(move || {
Expand Down
Loading