diff --git a/infra/tf/k8s_infra/nomad.tf b/infra/tf/k8s_infra/nomad.tf
index d6779f328..bdc5a6370 100644
--- a/infra/tf/k8s_infra/nomad.tf
+++ b/infra/tf/k8s_infra/nomad.tf
@@ -13,14 +13,9 @@
 # complicated + adds another point of failure and (b) it doesn't fix the problem with Nomad server addresses changing.
 
 locals {
-	# !!! DO NOT CHANGE !!!
-	#
-	# This value must be 3, 5, or 7. More = better redundancy, but does not make things faster.
-	#
-	# See https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul
 	nomad_server_count = var.deploy_method_cluster ? 3 : 1
-	nomad_server_addrs = [for i in range(0, local.nomad_server_count): "127.0.0.1:${6000 + i}"]
+	nomad_server_addrs = [for i in range(0, var.nomad_server_count): "127.0.0.1:${6000 + i}"]
 	nomad_server_addrs_escaped = [for addr in local.nomad_server_addrs : "\"${addr}\""]
 	nomad_server_configmap_data = {
 		"server.hcl" = <<-EOT
@@ -36,7 +31,7 @@ locals {
 
 			server {
 				enabled = true
-				bootstrap_expect = ${local.nomad_server_count}
+				bootstrap_expect = ${var.nomad_server_count}
 
 				server_join {
 					retry_join = [${join(", ", local.nomad_server_addrs_escaped)}]
@@ -128,7 +123,7 @@ resource "kubernetes_service" "nomad_server" {
 }
 
 resource "kubernetes_service" "nomad_server_indexed" {
-	count = var.edge_enabled ? local.nomad_server_count : 0
+	count = var.edge_enabled ? var.nomad_server_count : 0
 
 	metadata {
 		namespace = kubernetes_namespace.nomad.0.metadata.0.name
@@ -202,7 +197,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 		}
 	}
 	spec {
-		replicas = local.nomad_server_count
+		replicas = var.nomad_server_count
 
 		selector {
 			match_labels = {
@@ -324,7 +319,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 
 					# Entrypoints
 					flatten([
-						for i in range(0, local.nomad_server_count):
+						for i in range(0, var.nomad_server_count):
 						[
 							"--entryPoints.nomad-${i}-rpc-tcp.address=:${5000 + i}/tcp",
 							"--entryPoints.nomad-${i}-serf-tcp.address=:${6000 + i}/tcp",
@@ -334,7 +329,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 				])
 
 				dynamic "port" {
-					for_each = [for i in range(0, local.nomad_server_count) : i]
+					for_each = [for i in range(0, var.nomad_server_count) : i]
 					content {
 						name = "n-${port.value}-rpc-tcp"
 						container_port = 5000 + port.value
@@ -343,7 +338,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 				}
 
 				dynamic "port" {
-					for_each = [for i in range(0, local.nomad_server_count) : i]
+					for_each = [for i in range(0, var.nomad_server_count) : i]
 					content {
 						name = "n-${port.value}-serf-tcp"
 						container_port = 6000 + port.value
@@ -352,7 +347,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 				}
 
 				dynamic "port" {
-					for_each = [for i in range(0, local.nomad_server_count) : i]
+					for_each = [for i in range(0, var.nomad_server_count) : i]
 					content {
 						name = "n-${port.value}-serf-udp"
 						container_port = 6000 + port.value
@@ -421,7 +416,7 @@ resource "kubernetes_config_map" "nomad_server_sidecar_traefik_config" {
 	}
 
 	data = {
-		for i in range(0, local.nomad_server_count):
+		for i in range(0, var.nomad_server_count):
 		"nomad-${i}.yaml" => yamlencode({
 			tcp = {
 				routers = {
diff --git a/infra/tf/k8s_infra/vars.tf b/infra/tf/k8s_infra/vars.tf
index 88521cb63..dab91aace 100644
--- a/infra/tf/k8s_infra/vars.tf
+++ b/infra/tf/k8s_infra/vars.tf
@@ -60,6 +60,10 @@ variable "authenticate_all_docker_hub_pulls" {
 }
 
 # MARK: Nomad
+variable "nomad_server_count" {
+	type = number
+}
+
 variable "edge_enabled" {
 	type = bool
 }
diff --git a/lib/bolt/core/src/context/project.rs b/lib/bolt/core/src/context/project.rs
index e6b42eb51..033aaf4c1 100644
--- a/lib/bolt/core/src/context/project.rs
+++ b/lib/bolt/core/src/context/project.rs
@@ -900,6 +900,18 @@ impl ProjectContextData {
 			.and_then(|dns| dns.provider.as_ref())
 			.is_some()
 	}
+
+	pub fn nomad_server_count(&self) -> usize {
+		// !!! DO NOT CHANGE !!!
+		//
+		// This value must be 1, 3, 5, or 7. More = better redundancy, but does not make things faster.
+		//
+		// See https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul
+		match self.ns().cluster.kind {
+			config::ns::ClusterKind::Distributed { .. } => 3,
+			config::ns::ClusterKind::SingleNode { .. } => 1,
+		}
+	}
 }
 
 pub struct S3Credentials {
diff --git a/lib/bolt/core/src/context/service.rs b/lib/bolt/core/src/context/service.rs
index 7dd2ddffa..f5277faaa 100644
--- a/lib/bolt/core/src/context/service.rs
+++ b/lib/bolt/core/src/context/service.rs
@@ -1047,6 +1047,12 @@ impl ServiceContextData {
 			env.insert("RIVET_PROFANITY_FILTER_DISABLE".into(), "1".into());
 		}
 
+		// Nomad
+		env.insert(
+			"NOMAD_SERVER_COUNT".into(),
+			project_ctx.nomad_server_count().to_string(),
+		);
+
 		if let Some(provisioning) = &project_ctx.ns().rivet.provisioning {
 			if self.depends_on_cluster_config() || matches!(run_context, RunContext::Test { .. }) {
 				env.insert(
@@ -1302,7 +1308,6 @@ impl ServiceContextData {
 		//
 		if self.depends_on_infra() && project_ctx.ns().rivet.provisioning.is_some() {
 			let tls = terraform::output::read_tls(&project_ctx).await;
-			let k8s_infra = terraform::output::read_k8s_infra(&project_ctx).await;
 
 			env.insert(
 				"TLS_CERT_LOCALLY_SIGNED_JOB_CERT_PEM".into(),
diff --git a/lib/bolt/core/src/dep/terraform/gen.rs b/lib/bolt/core/src/dep/terraform/gen.rs
index 15d0d10ca..6636a49a4 100644
--- a/lib/bolt/core/src/dep/terraform/gen.rs
+++ b/lib/bolt/core/src/dep/terraform/gen.rs
@@ -224,6 +224,7 @@ async fn vars(ctx: &ProjectContext) {
 	}
 
 	// Edge nodes
+	vars.insert("nomad_server_count".into(), json!(ctx.nomad_server_count()));
 	vars.insert(
 		"edge_enabled".into(),
 		json!(config.rivet.provisioning.is_some()),
diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/nomad.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/nomad.rs
index acaaf99b9..dfd8e3a7b 100644
--- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/nomad.rs
+++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/components/nomad.rs
@@ -4,10 +4,13 @@ pub fn install() -> String {
 	include_str!("../files/nomad_install.sh").to_string()
 }
 
-pub fn configure() -> String {
-	let servers = &["127.0.0.1:5000", "127.0.0.1:5001", "127.0.0.1:5002"];
+pub fn configure() -> GlobalResult<String> {
+	let nomad_server_count = util::env::var("NOMAD_SERVER_COUNT")?.parse::<usize>()?;
+	let servers = (0..nomad_server_count)
+		.map(|idx| format!("127.0.0.1:{}", 5000 + idx))
+		.collect::<Vec<_>>();
 
-	include_str!("../files/nomad_configure.sh")
+	Ok(include_str!("../files/nomad_configure.sh")
 		// HACK: Hardcoded to Linode
 		.replace("__PUBLIC_IFACE__", "eth0")
 		// HACK: Hardcoded to Linode
@@ -27,5 +30,5 @@ pub fn configure() -> String {
 		.replace(
 			"__ATS_VLAN_SUBNET__",
 			&util::net::ats::vlan_ip_net().to_string(),
-		)
+		))
 }
diff --git a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs
index 57e8c5859..cc6effaa0 100644
--- a/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs
+++ b/svc/pkg/cluster/worker/src/workers/server_install/install_scripts/mod.rs
@@ -92,7 +92,7 @@ pub async fn gen_initialize(
 	// MARK: Specific pool components
 	match pool_type {
 		backend::cluster::PoolType::Job => {
-			script.push(components::nomad::configure());
+			script.push(components::nomad::configure()?);
 
 			prometheus_targets.insert(
 				"nomad".into(),