diff --git a/infra/default-builds/dockerfiles/test-mm-lobby-echo/Dockerfile b/infra/default-builds/dockerfiles/test-mm-lobby-echo/Dockerfile
index b052729ee..c3971456b 100644
--- a/infra/default-builds/dockerfiles/test-mm-lobby-echo/Dockerfile
+++ b/infra/default-builds/dockerfiles/test-mm-lobby-echo/Dockerfile
@@ -1,4 +1,4 @@
-FROM clux/muslrust:1.77.2 AS build
+FROM clux/muslrust:1.77.2-stable AS build
 RUN cargo new --bin /app
 WORKDIR /app
 COPY Cargo.toml ./
diff --git a/infra/tf/cloudflare_tunnels/k8s.tf b/infra/tf/cloudflare_tunnels/k8s.tf
index b18539415..3444a2dc9 100644
--- a/infra/tf/cloudflare_tunnels/k8s.tf
+++ b/infra/tf/cloudflare_tunnels/k8s.tf
@@ -47,6 +47,8 @@ resource "kubernetes_deployment" "cloudflared" {
 			}
 
 			spec {
+				priority_class_name = "service-priority"
+
 				container {
 					image = "cloudflare/cloudflared:2023.8.2"
 					name = "cloudflared"
diff --git a/infra/tf/grafana/grafana.tf b/infra/tf/grafana/grafana.tf
index 1051062dd..b363998f3 100644
--- a/infra/tf/grafana/grafana.tf
+++ b/infra/tf/grafana/grafana.tf
@@ -30,6 +30,7 @@ resource "helm_release" "grafana" {
 	chart = "grafana"
 	version = "7.3.9"
 	values = [yamlencode({
+		priorityClassName = "monitoring-priority"
 		"grafana.ini" = {
 			auth = {
 				disable_login_form = true
diff --git a/infra/tf/k8s_cluster_aws/overprovisioning.tf b/infra/tf/k8s_cluster_aws/overprovisioning.tf
new file mode 100644
index 000000000..49a0a10c5
--- /dev/null
+++ b/infra/tf/k8s_cluster_aws/overprovisioning.tf
@@ -0,0 +1,70 @@
+# Define low-priority deployments to maintain unallocated capacity on the Karpenter
+# cluster to prevent preemption.
+#
+# Higher replicas + lower resources = more, smaller pods can preempt
+# Lower replicas + higher resources = a larger pod can preempt, but it will require bigger preempted "wedges"
+#
+# https://aws.amazon.com/blogs/containers/eliminate-kubernetes-node-scaling-lag-with-pod-priority-and-over-provisioning/
+
+resource "kubernetes_namespace" "karpenter_overprovision" {
+	metadata {
+		name = "karpenter-overprovision"
+	}
+}
+
+resource "kubernetes_priority_class" "overprovision_priority" {
+	metadata {
+		name = "overprovision-priority"
+	}
+	value = -1
+}
+
+resource "kubernetes_deployment" "overprovision" {
+	metadata {
+		name = "overprovision"
+		namespace = kubernetes_namespace.karpenter_overprovision.metadata.0.name
+		labels = {
+			app = "overprovisioning"
+		}
+	}
+
+	spec {
+		replicas = 2
+
+		selector {
+			match_labels = {
+				app = "overprovisioning"
+			}
+		}
+
+		template {
+			metadata {
+				labels = {
+					app = "overprovisioning"
+				}
+			}
+
+			spec {
+				container {
+					name = "pause"
+					image = "registry.k8s.io/pause"
+
+					resources {
+						requests = {
+							cpu = "1"
+							memory = "500Mi"
+						}
+
+						limits = {
+							cpu = "1"
+							memory = "500Mi"
+						}
+					}
+				}
+
+				priority_class_name = kubernetes_priority_class.overprovision_priority.metadata.0.name
+			}
+		}
+	}
+}
+
diff --git a/infra/tf/k8s_infra/clickhouse.tf b/infra/tf/k8s_infra/clickhouse.tf
index 24abddea6..fd04fe7c6 100644
--- a/infra/tf/k8s_infra/clickhouse.tf
+++ b/infra/tf/k8s_infra/clickhouse.tf
@@ -27,16 +27,6 @@ resource "kubernetes_namespace" "clickhouse" {
 	}
 }
 
-resource "kubernetes_priority_class" "clickhouse_priority" {
-	count = local.clickhouse_enabled ? 1 : 0
-
-	metadata {
-		name = "clickhouse-priority"
-	}
-
-	value = 40
-}
-
 resource "helm_release" "clickhouse" {
 	count = local.clickhouse_enabled ? 1 : 0
 	depends_on = [null_resource.daemons]
@@ -57,7 +47,7 @@ resource "helm_release" "clickhouse" {
 			replicaCount = 1
 		}
 
-		priorityClassName = kubernetes_priority_class.clickhouse_priority.0.metadata.0.name
+		priorityClassName = kubernetes_priority_class.stateful_priority.metadata.0.name
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_clickhouse.resources.memory}Mi"
diff --git a/infra/tf/k8s_infra/cockroachdb.tf b/infra/tf/k8s_infra/cockroachdb.tf
index 7f48e1d59..503db6c9a 100644
--- a/infra/tf/k8s_infra/cockroachdb.tf
+++ b/infra/tf/k8s_infra/cockroachdb.tf
@@ -24,14 +24,6 @@ resource "kubernetes_namespace" "cockroachdb" {
 	}
 }
 
-resource "kubernetes_priority_class" "cockroachdb_priority" {
-	metadata {
-		name = "cockroachdb-priority"
-	}
-
-	value = 40
-}
-
 # NOTE: Helm chart is no longer supported by CockroachDB. However, it's intended to be used only for development and it's the easiest to set up.
 resource "helm_release" "cockroachdb" {
 	depends_on = [null_resource.daemons]
@@ -46,7 +38,7 @@ resource "helm_release" "cockroachdb" {
 		statefulset = {
 			replicas = local.service_cockroachdb.count
 
-			priorityClassName = kubernetes_priority_class.cockroachdb_priority.metadata.0.name
+			priorityClassName = kubernetes_priority_class.stateful_priority.metadata.0.name
 
 			resources = var.limit_resources ? {
 				limits = {
diff --git a/infra/tf/k8s_infra/imagor.tf b/infra/tf/k8s_infra/imagor.tf
index ba1cfcff5..fa3d2dbc4 100644
--- a/infra/tf/k8s_infra/imagor.tf
+++ b/infra/tf/k8s_infra/imagor.tf
@@ -56,16 +56,6 @@ resource "kubernetes_namespace" "imagor" {
 	}
 }
 
-resource "kubernetes_priority_class" "imagor_priority" {
-	count = var.imagor_enabled ? 1 : 0
-
-	metadata {
-		name = "imagor-priority"
-	}
-
-	value = 35
-}
-
 resource "kubernetes_deployment" "imagor" {
 	count = var.imagor_enabled ? 1 : 0
 	depends_on = [null_resource.daemons, module.docker_auth]
@@ -92,7 +82,7 @@ resource "kubernetes_deployment" "imagor" {
 			}
 
 			spec {
-				priority_class_name = kubernetes_priority_class.imagor_priority.0.metadata.0.name
+				priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name
 
 				# MARK: Docker auth
 				image_pull_secrets {
diff --git a/infra/tf/k8s_infra/loki.tf b/infra/tf/k8s_infra/loki.tf
index 603c9f047..556fad137 100644
--- a/infra/tf/k8s_infra/loki.tf
+++ b/infra/tf/k8s_infra/loki.tf
@@ -16,16 +16,6 @@ resource "kubernetes_namespace" "loki" {
 	}
 }
 
-resource "kubernetes_priority_class" "loki_priority" {
-	count = var.prometheus_enabled ? 1 : 0
-
-	metadata {
-		name = "loki-priority"
-	}
-
-	value = 40
-}
-
 resource "helm_release" "loki" {
 	count = var.prometheus_enabled ? 1 : 0
 
@@ -36,7 +26,7 @@ resource "helm_release" "loki" {
 	version = "5.36.0"
 	values = [yamlencode({
 		global = {
-			priorityClassName = kubernetes_priority_class.loki_priority.0.metadata.0.name
+			priorityClassName = kubernetes_priority_class.monitoring_priority.metadata.0.name
 		}
 		loki = {
 			auth_enabled = false
diff --git a/infra/tf/k8s_infra/minio.tf b/infra/tf/k8s_infra/minio.tf
index ab35ab19f..de9cf3245 100644
--- a/infra/tf/k8s_infra/minio.tf
+++ b/infra/tf/k8s_infra/minio.tf
@@ -26,14 +26,6 @@ module "minio_secrets" {
 	optional = true
 }
 
-resource "kubernetes_priority_class" "minio_priority" {
-	metadata {
-		name = "minio-priority"
-	}
-
-	value = 40
-}
-
 resource "helm_release" "minio" {
 	depends_on = [null_resource.daemons]
 	count = local.has_minio ? 1 : 0
@@ -48,7 +40,7 @@ resource "helm_release" "minio" {
 			storageClass = var.k8s_storage_class
 		}
 		replicaCount = local.service_minio.count
-		priorityClassName = kubernetes_priority_class.minio_priority.metadata.0.name
+		priorityClassName = kubernetes_priority_class.service_priority.metadata.0.name
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_minio.resources.memory}Mi"
diff --git a/infra/tf/k8s_infra/nats.tf b/infra/tf/k8s_infra/nats.tf
index fbc7d3e34..b6ca9af90 100644
--- a/infra/tf/k8s_infra/nats.tf
+++ b/infra/tf/k8s_infra/nats.tf
@@ -14,14 +14,7 @@ resource "kubernetes_namespace" "nats" {
 	}
 }
 
-resource "kubernetes_priority_class" "nats_priority" {
-	metadata {
-		name = "nats-priority"
-	}
-
-	value = 40
-}
-
+# TODO(RVTEE-105): Fix priority class
 resource "helm_release" "nats" {
 	depends_on = [null_resource.daemons]
 
@@ -37,11 +30,6 @@ resource "helm_release" "nats" {
 				replicas = local.service_nats.count
 			}
 		}
-		podTemplate = {
-			merge = {
-				priorityClassName = kubernetes_priority_class.nats_priority.metadata.0.name
-			}
-		}
 		container = {
 			env = {
 				# See https://artifacthub.io/packages/helm/grafana/grafana#nats-container-resources
diff --git a/infra/tf/k8s_infra/nomad.tf b/infra/tf/k8s_infra/nomad.tf
index 3e3d3e741..62dc8bcb3 100644
--- a/infra/tf/k8s_infra/nomad.tf
+++ b/infra/tf/k8s_infra/nomad.tf
@@ -190,16 +190,6 @@ resource "kubectl_manifest" "nomad_server_monitor" {
 	})
 }
 
-resource "kubernetes_priority_class" "nomad_priority" {
-	count = var.edge_enabled ? 1 : 0
-
-	metadata {
-		name = "nomad-priority"
-	}
-
-	value = 40
-}
-
 resource "kubernetes_stateful_set" "nomad_server" {
 	count = var.edge_enabled ? 1 : 0
 	depends_on = [null_resource.daemons]
@@ -234,7 +224,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
 			}
 
 			spec {
-				priority_class_name = kubernetes_priority_class.nomad_priority.0.metadata.0.name
+				priority_class_name = kubernetes_priority_class.stateful_priority.metadata.0.name
 
 				security_context {
 					run_as_user = 0
diff --git a/infra/tf/k8s_infra/nsfw_api.tf b/infra/tf/k8s_infra/nsfw_api.tf
index 1bc2770b2..63a62dbf1 100644
--- a/infra/tf/k8s_infra/nsfw_api.tf
+++ b/infra/tf/k8s_infra/nsfw_api.tf
@@ -16,16 +16,6 @@ resource "kubernetes_namespace" "nsfw_api" {
 	}
 }
 
-resource "kubernetes_priority_class" "nsfw_api_priority" {
-	count = var.nsfw_api_enabled ? 1 : 0
-
-	metadata {
-		name = "nsfw-api-priority"
-	}
-
-	value = 40
-}
-
 resource "kubernetes_deployment" "nsfw_api" {
 	count = var.nsfw_api_enabled ? 1 : 0
 	depends_on = [null_resource.daemons, module.docker_auth]
@@ -52,7 +42,7 @@ resource "kubernetes_deployment" "nsfw_api" {
 			}
 
 			spec {
-				priority_class_name = kubernetes_priority_class.nsfw_api_priority.0.metadata.0.name
+				priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name
 
 				# MARK: Docker auth
 				image_pull_secrets {
diff --git a/infra/tf/k8s_infra/priority_class.tf b/infra/tf/k8s_infra/priority_class.tf
new file mode 100644
index 000000000..48ef405b9
--- /dev/null
+++ b/infra/tf/k8s_infra/priority_class.tf
@@ -0,0 +1,41 @@
+# Used for services that can be preempted often.
+#
+# We set almost everything to this class (even crucial infrastructure) because
+# we would rather wait for Karpenter to boot a new node than shut down
+# existing services.
+resource "kubernetes_priority_class" "service_priority" {
+
+	metadata {
+		name = "service-priority"
+	}
+	value = 50
+}
+
+# Used for anything required to monitor the other services. These should take
+# priority no matter what in order to ensure we have visibility into what's
+# going on.
+resource "kubernetes_priority_class" "monitoring_priority" {
+
+	metadata {
+		name = "monitoring-priority"
+	}
+	value = 50
+}
+
+# Used for anything stateful that should not be frequently preempted.
+resource "kubernetes_priority_class" "stateful_priority" {
+
+	metadata {
+		name = "stateful-priority"
+	}
+	value = 60
+}
+
+# Used for daemons that run on the machines and need to be scheduled no matter what.
+resource "kubernetes_priority_class" "daemon_priority" {
+
+	metadata {
+		name = "daemon-priority"
+	}
+	value = 90
+}
diff --git a/infra/tf/k8s_infra/prometheus.tf b/infra/tf/k8s_infra/prometheus.tf
index f9135dddc..2a86398dc 100644
--- a/infra/tf/k8s_infra/prometheus.tf
+++ b/infra/tf/k8s_infra/prometheus.tf
@@ -86,25 +86,6 @@ resource "kubernetes_namespace" "prometheus" {
 	}
 }
 
-# Set a high priority for Node Exporter so it can run on all nodes
-resource "kubernetes_priority_class" "node_exporter_priority" {
-	count = var.prometheus_enabled ? 1 : 0
-
-	metadata {
-		name = "node-exporter-priority"
-	}
-	value = 90
-}
-
-resource "kubernetes_priority_class" "prometheus_priority" {
-	count = var.prometheus_enabled ? 1 : 0
-
-	metadata {
-		name = "prometheus-priority"
-	}
-	value = 40
-}
-
 resource "helm_release" "prometheus" {
 	count = var.prometheus_enabled ? 1 : 0
 	depends_on = [helm_release.vpa]
@@ -122,7 +103,7 @@
 				cpu = "${local.service_node_exporter.resources.cpu}m"
 			}
 		} : null
-		priorityClassName = kubernetes_priority_class.node_exporter_priority.0.metadata.0.name
+		priorityClassName = kubernetes_priority_class.daemon_priority.metadata.0.name
 		affinity = {
 			nodeAffinity = {
 				requiredDuringSchedulingIgnoredDuringExecution = {
@@ -270,7 +251,7 @@ resource "helm_release" "prometheus" {
 			}
 		}
 
-		priorityClassName = kubernetes_priority_class.prometheus_priority.0.metadata.0.name
+		priorityClassName = kubernetes_priority_class.monitoring_priority.metadata.0.name
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_prometheus.resources.memory}Mi"
diff --git a/infra/tf/k8s_infra/promtail.tf b/infra/tf/k8s_infra/promtail.tf
index 46bdf1945..5ce245a39 100644
--- a/infra/tf/k8s_infra/promtail.tf
+++ b/infra/tf/k8s_infra/promtail.tf
@@ -16,15 +16,6 @@ resource "kubernetes_namespace" "promtail" {
 	}
 }
 
-resource "kubernetes_priority_class" "promtail_priority" {
-	count = var.prometheus_enabled ? 1 : 0
-
-	metadata {
-		name = "promtail-priority"
-	}
-	value = 40
-}
-
 resource "helm_release" "promtail" {
 	count = var.prometheus_enabled ? 1 : 0
 
@@ -146,7 +137,7 @@ resource "helm_release" "promtail" {
 			}
 		}
 
-		priorityClassName = kubernetes_priority_class.promtail_priority.0.metadata.0.name
+		priorityClassName = kubernetes_priority_class.daemon_priority.metadata.0.name
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_promtail.resources.memory}Mi"
diff --git a/infra/tf/k8s_infra/redis.tf b/infra/tf/k8s_infra/redis.tf
index 759a62180..2003a08af 100644
--- a/infra/tf/k8s_infra/redis.tf
+++ b/infra/tf/k8s_infra/redis.tf
@@ -20,7 +20,7 @@ locals {
 	redis_node_config = {
 		for k, v in var.redis_dbs:
 		k => {
-			priorityClassName = kubernetes_priority_class.redis_priority.metadata.0.name
+			priorityClassName = kubernetes_priority_class.stateful_priority.metadata.0.name
 			resources = var.limit_resources ? {
 				limits = {
 					memory = "${local.service_redis.resources.memory}Mi"
@@ -53,13 +53,6 @@ resource "kubernetes_namespace" "redis" {
 	}
 }
 
-resource "kubernetes_priority_class" "redis_priority" {
-	metadata {
-		name = "redis-priority"
-	}
-	value = 40
-}
-
 resource "helm_release" "redis" {
 	depends_on = [null_resource.daemons]
 	for_each = local.redis_svcs
diff --git a/infra/tf/k8s_infra/traefik.tf b/infra/tf/k8s_infra/traefik.tf
index d2a4af33b..b6597b466 100644
--- a/infra/tf/k8s_infra/traefik.tf
+++ b/infra/tf/k8s_infra/traefik.tf
@@ -23,13 +23,6 @@ locals {
 	})
 }
 
-resource "kubernetes_priority_class" "traefik_priority" {
-	metadata {
-		name = "traefik-priority"
-	}
-	value = 40
-}
-
 resource "helm_release" "traefik" {
 	depends_on = [null_resource.daemons]
 
@@ -56,7 +49,7 @@ resource "helm_release" "traefik" {
 			"traefik-instance" = "main"
 		}
 
-		priorityClassName = kubernetes_priority_class.traefik_priority.metadata.0.name
+		priorityClassName = kubernetes_priority_class.service_priority.metadata.0.name
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_traefik.resources.memory}Mi"
diff --git a/infra/tf/k8s_infra/traefik_tunnel.tf b/infra/tf/k8s_infra/traefik_tunnel.tf
index 128d68b16..f179d458e 100644
--- a/infra/tf/k8s_infra/traefik_tunnel.tf
+++ b/infra/tf/k8s_infra/traefik_tunnel.tf
@@ -68,15 +68,6 @@ resource "kubernetes_namespace" "traefik_tunnel" {
 	}
 }
 
-resource "kubernetes_priority_class" "traefik_tunnel_priority" {
-	count = var.edge_enabled ? 1 : 0
-
-	metadata {
-		name = "traefik-tunnel-priority"
-	}
-	value = 40
-}
-
 resource "helm_release" "traefik_tunnel" {
 	count = var.edge_enabled ? 1 : 0
 
@@ -114,7 +105,7 @@ resource "helm_release" "traefik_tunnel" {
 			}
 		}
 
-		priorityClassName = kubernetes_priority_class.traefik_tunnel_priority.0.metadata.0.name
+		priorityClassName = kubernetes_priority_class.service_priority.metadata.0.name
 
 		tlsOptions = {
 			"ingress-tunnel" = {
diff --git a/infra/tf/k8s_infra/traffic_server.tf b/infra/tf/k8s_infra/traffic_server.tf
index 250451765..c37ae945d 100644
--- a/infra/tf/k8s_infra/traffic_server.tf
+++ b/infra/tf/k8s_infra/traffic_server.tf
@@ -107,13 +107,6 @@ resource "kubernetes_service" "traffic_server" {
 	}
 }
 
-resource "kubernetes_priority_class" "traffic_server_priority" {
-	metadata {
-		name = "traffic-server-priority"
-	}
-	value = 40
-}
-
 resource "kubernetes_stateful_set" "traffic_server" {
 	depends_on = [null_resource.daemons, module.docker_auth]
 
@@ -148,7 +141,7 @@ resource "kubernetes_stateful_set" "traffic_server" {
 			}
 
 			spec {
-				priority_class_name = kubernetes_priority_class.traffic_server_priority.metadata.0.name
+				priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name
 
 				security_context {
 					run_as_user = 1000
diff --git a/infra/tf/vector/vector.tf b/infra/tf/vector/vector.tf
index a81c6cc52..e8b635cf3 100644
--- a/infra/tf/vector/vector.tf
+++ b/infra/tf/vector/vector.tf
@@ -10,13 +10,6 @@ locals {
 	clickhouse_k8s = var.clickhouse_enabled && var.clickhouse_provider == "kubernetes"
 }
 
-resource "kubernetes_priority_class" "vector_priority" {
-	metadata {
-		name = "vector-priority"
-	}
-	value = 40
-}
-
 module "secrets" {
 	source = "../modules/secrets"
 
@@ -33,7 +26,7 @@ resource "helm_release" "vector" {
 	version = "0.29.0"
 	values = [yamlencode({
 		role = "Aggregator"
-		podPriorityClassName = kubernetes_priority_class.vector_priority.metadata.0.name
+		podPriorityClassName = "service-priority"
 		resources = var.limit_resources ? {
 			limits = {
 				memory = "${local.service_vector.resources.memory}Mi"
diff --git a/lib/bolt/core/src/dep/k8s/gen.rs b/lib/bolt/core/src/dep/k8s/gen.rs
index 1b5e9a92e..ae7501160 100644
--- a/lib/bolt/core/src/dep/k8s/gen.rs
+++ b/lib/bolt/core/src/dep/k8s/gen.rs
@@ -364,18 +364,6 @@ pub async fn gen_svc(exec_ctx: &ExecServiceContext) -> Vec<serde_json::Value> {
 		}
 	});
 
-	// Create priority class
-	let priority_class_name = format!("{}-priority", service_name);
-	specs.push(json!({
-		"apiVersion": "scheduling.k8s.io/v1",
-		"kind": "PriorityClass",
-		"metadata": {
-			"name": priority_class_name,
-			"namespace": "rivet-service"
-		},
-		"value": svc_ctx.config().service.priority()
-	}));
-
 	let restart_policy = if matches!(run_context, RunContext::Test { .. }) {
 		"Never"
 	} else if spec_type == SpecType::Deployment {
@@ -395,7 +383,7 @@ pub async fn gen_svc(exec_ctx: &ExecServiceContext) -> Vec<serde_json::Value> {
 		config::ns::ClusterKind::Distributed { .. } => 30,
 	};
 	let pod_spec = json!({
-		"priorityClassName": priority_class_name,
+		"priorityClassName": "service-priority",
 		"restartPolicy": restart_policy,
 		"terminationGracePeriodSeconds": termination_grace_period,
 		"imagePullSecrets": [{
diff --git a/lib/job-runner/Dockerfile b/lib/job-runner/Dockerfile
index e29d332fa..87e9b4890 100644
--- a/lib/job-runner/Dockerfile
+++ b/lib/job-runner/Dockerfile
@@ -1,4 +1,4 @@
-FROM clux/muslrust:1.77.2
+FROM clux/muslrust:1.77.2-stable
 WORKDIR /app
 COPY Cargo.toml Cargo.lock .
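
For reference, the consolidation above reduces every workload to the same opt-in pattern: Terraform-managed pod specs reference one of the shared class resources directly, while Helm-managed workloads receive the class by name (priorityClassName = "service-priority") through their values. A minimal sketch, assuming a hypothetical "example-service" deployment living in the same module as priority_class.tf (names are illustrative, not part of the patch):

resource "kubernetes_deployment" "example_service" {
	metadata {
		name      = "example-service"
		namespace = "rivet-service"
	}

	spec {
		replicas = 1

		selector {
			match_labels = {
				app = "example-service"
			}
		}

		template {
			metadata {
				labels = {
					app = "example-service"
				}
			}

			spec {
				# Shared class from priority_class.tf (value 50): able to preempt the
				# overprovision pods (value -1), but yields to stateful-priority (60)
				# and daemon-priority (90) workloads.
				priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name

				container {
					name  = "example"
					image = "registry.k8s.io/pause"
				}
			}
		}
	}
}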