Commit
chore(k8s): update priority classes to play nice with karpenter & preemption (#801)

NathanFlurry committed May 31, 2024
1 parent 1976900 commit 7b17295
Showing 22 changed files with 133 additions and 174 deletions.
@@ -1,4 +1,4 @@
FROM clux/muslrust:1.77.2 AS build
FROM clux/muslrust:1.77.2-stable AS build
RUN cargo new --bin /app
WORKDIR /app
COPY Cargo.toml ./
2 changes: 2 additions & 0 deletions infra/tf/cloudflare_tunnels/k8s.tf
@@ -47,6 +47,8 @@ resource "kubernetes_deployment" "cloudflared" {
}

spec {
priority_class_name = "service-priority"

container {
image = "cloudflare/cloudflared:2023.8.2"
name = "cloudflared"
1 change: 1 addition & 0 deletions infra/tf/grafana/grafana.tf
@@ -30,6 +30,7 @@ resource "helm_release" "grafana" {
chart = "grafana"
version = "7.3.9"
values = [yamlencode({
priorityClassName = "monitoring-priority"
"grafana.ini" = {
auth = {
disable_login_form = true
70 changes: 70 additions & 0 deletions infra/tf/k8s_cluster_aws/overprovisioning.tf
@@ -0,0 +1,70 @@
# Define low-priority deployments to maintain unallocated capacity on the
# Karpenter cluster to prevent preemption.
#
# Higher replicas + lower resources = more, smaller pods can preempt
# Lower replicas + higher resources = a larger pod can preempt, but it will require bigger preempted "wedges"
#
# https://aws.amazon.com/blogs/containers/eliminate-kubernetes-node-scaling-lag-with-pod-priority-and-over-provisioning/

resource "kubernetes_namespace" "karpenter_overprovision" {
metadata {
name = "karpenter-overprovision"
}
}

resource "kubernetes_priority_class" "overprovision_priority" {
metadata {
name = "overprovision-priority"
}
value = -1
}

resource "kubernetes_deployment" "overprovision" {
metadata {
name = "overprovision"
namespace = kubernetes_namespace.karpenter_overprovision.metadata.0.name
labels = {
app = "overprovisioning"
}
}

spec {
replicas = 2

selector {
match_labels = {
app = "overprovisioning"
}
}

template {
metadata {
labels = {
app = "overprovisioning"
}
}

spec {
container {
name = "pause"
image = "registry.k8s.io/pause"

resources {
requests = {
cpu = "1"
memory = "500Mi"
}

limits = {
cpu = "1"
memory = "500Mi"
}
}
}

priority_class_name = kubernetes_priority_class.overprovision_priority.metadata.0.name
}
}
}
}
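
Because the class value is -1, any pod at the default priority (0) or above can evict these pause pods, and Karpenter then provisions a replacement node to restore the headroom. The reserved headroom works out to replicas × per-pod request; a minimal illustrative sketch of that arithmetic (the locals block below is hypothetical and not part of this commit, the numbers simply mirror the deployment above):

locals {
  # Values copied from the overprovision deployment above.
  overprovision_replicas   = 2
  overprovision_cpu_cores  = 1
  overprovision_memory_mib = 500

  # Headroom kept warm on the cluster: 2 CPU cores and 1000 Mi of memory stay
  # unallocated until a higher-priority pod preempts the pause pods.
  reserved_cpu_cores  = local.overprovision_replicas * local.overprovision_cpu_cores
  reserved_memory_mib = local.overprovision_replicas * local.overprovision_memory_mib
}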

12 changes: 1 addition & 11 deletions infra/tf/k8s_infra/clickhouse.tf
@@ -27,16 +27,6 @@ resource "kubernetes_namespace" "clickhouse" {
}
}

resource "kubernetes_priority_class" "clickhouse_priority" {
count = local.clickhouse_enabled ? 1 : 0

metadata {
name = "clickhouse-priority"
}

value = 40
}

resource "helm_release" "clickhouse" {
count = local.clickhouse_enabled ? 1 : 0
depends_on = [null_resource.daemons]
@@ -57,7 +47,7 @@ resource "helm_release" "clickhouse" {
replicaCount = 1
}

priorityClassName = kubernetes_priority_class.clickhouse_priority.0.metadata.0.name
priorityClassName = kubernetes_priority_class.stateful_priority.metadata.0.name
resources = var.limit_resources ? {
limits = {
memory = "${local.service_clickhouse.resources.memory}Mi"
10 changes: 1 addition & 9 deletions infra/tf/k8s_infra/cockroachdb.tf
@@ -24,14 +24,6 @@ resource "kubernetes_namespace" "cockroachdb" {
}
}

resource "kubernetes_priority_class" "cockroachdb_priority" {
metadata {
name = "cockroachdb-priority"
}

value = 40
}

# NOTE: Helm chart is no longer supported by CockroachDB. However, it's intended to be used only for development and it's the easiest to set up.
resource "helm_release" "cockroachdb" {
depends_on = [null_resource.daemons]
@@ -46,7 +38,7 @@
statefulset = {
replicas = local.service_cockroachdb.count

priorityClassName = kubernetes_priority_class.cockroachdb_priority.metadata.0.name
priorityClassName = kubernetes_priority_class.stateful_priority.metadata.0.name

resources = var.limit_resources ? {
limits = {
12 changes: 1 addition & 11 deletions infra/tf/k8s_infra/imagor.tf
@@ -56,16 +56,6 @@ resource "kubernetes_namespace" "imagor" {
}
}

resource "kubernetes_priority_class" "imagor_priority" {
count = var.imagor_enabled ? 1 : 0

metadata {
name = "imagor-priority"
}

value = 35
}

resource "kubernetes_deployment" "imagor" {
count = var.imagor_enabled ? 1 : 0
depends_on = [null_resource.daemons, module.docker_auth]
@@ -92,7 +82,7 @@
}

spec {
priority_class_name = kubernetes_priority_class.imagor_priority.0.metadata.0.name
priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name

# MARK: Docker auth
image_pull_secrets {
12 changes: 1 addition & 11 deletions infra/tf/k8s_infra/loki.tf
@@ -16,16 +16,6 @@ resource "kubernetes_namespace" "loki" {
}
}

resource "kubernetes_priority_class" "loki_priority" {
count = var.prometheus_enabled ? 1 : 0

metadata {
name = "loki-priority"
}

value = 40
}

resource "helm_release" "loki" {
count = var.prometheus_enabled ? 1 : 0

@@ -36,7 +26,7 @@
version = "5.36.0"
values = [yamlencode({
global = {
priorityClassName = kubernetes_priority_class.loki_priority.0.metadata.0.name
priorityClassName = kubernetes_priority_class.monitoring_priority.metadata.0.name
}
loki = {
auth_enabled = false
10 changes: 1 addition & 9 deletions infra/tf/k8s_infra/minio.tf
@@ -26,14 +26,6 @@ module "minio_secrets" {
optional = true
}

resource "kubernetes_priority_class" "minio_priority" {
metadata {
name = "minio-priority"
}

value = 40
}

resource "helm_release" "minio" {
depends_on = [null_resource.daemons]
count = local.has_minio ? 1 : 0
@@ -48,7 +40,7 @@
storageClass = var.k8s_storage_class
}
replicaCount = local.service_minio.count
priorityClassName = kubernetes_priority_class.minio_priority.metadata.0.name
priorityClassName = kubernetes_priority_class.service_priority.metadata.0.name
resources = var.limit_resources ? {
limits = {
memory = "${local.service_minio.resources.memory}Mi"
14 changes: 1 addition & 13 deletions infra/tf/k8s_infra/nats.tf
@@ -14,14 +14,7 @@ resource "kubernetes_namespace" "nats" {
}
}

resource "kubernetes_priority_class" "nats_priority" {
metadata {
name = "nats-priority"
}

value = 40
}

# TODO(RVTEE-105): Fix priority class
resource "helm_release" "nats" {
depends_on = [null_resource.daemons]

@@ -37,11 +30,6 @@
replicas = local.service_nats.count
}
}
podTemplate = {
merge = {
priorityClassName = kubernetes_priority_class.nats_priority.metadata.0.name
}
}
container = {
env = {
# See https://artifacthub.io/packages/helm/grafana/grafana#nats-container-resources
12 changes: 1 addition & 11 deletions infra/tf/k8s_infra/nomad.tf
@@ -190,16 +190,6 @@ resource "kubectl_manifest" "nomad_server_monitor" {
})
}

resource "kubernetes_priority_class" "nomad_priority" {
count = var.edge_enabled ? 1 : 0

metadata {
name = "nomad-priority"
}

value = 40
}

resource "kubernetes_stateful_set" "nomad_server" {
count = var.edge_enabled ? 1 : 0
depends_on = [null_resource.daemons]
@@ -234,7 +224,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
}

spec {
priority_class_name = kubernetes_priority_class.nomad_priority.0.metadata.0.name
priority_class_name = kubernetes_priority_class.stateful_priority.metadata.0.name

security_context {
run_as_user = 0
12 changes: 1 addition & 11 deletions infra/tf/k8s_infra/nsfw_api.tf
@@ -16,16 +16,6 @@ resource "kubernetes_namespace" "nsfw_api" {
}
}

resource "kubernetes_priority_class" "nsfw_api_priority" {
count = var.nsfw_api_enabled ? 1 : 0

metadata {
name = "nsfw-api-priority"
}

value = 40
}

resource "kubernetes_deployment" "nsfw_api" {
count = var.nsfw_api_enabled ? 1 : 0
depends_on = [null_resource.daemons, module.docker_auth]
@@ -52,7 +42,7 @@
}

spec {
priority_class_name = kubernetes_priority_class.nsfw_api_priority.0.metadata.0.name
priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name

# MARK: Docker auth
image_pull_secrets {
41 changes: 41 additions & 0 deletions infra/tf/k8s_infra/priority_class.tf
@@ -0,0 +1,41 @@
# Used for services that can be preempted often.
#
# We set almost everything to this class (even crucial infrastructure) because
# we would rather wait for Karpenter to boot a new node than shut down
# existing services.
resource "kubernetes_priority_class" "service_priority" {

metadata {
name = "service-priority"
}
value = 50
}

# Used for anything required to monitor the other services. These should take
# priority no matter what so we keep visibility into what's going on.
resource "kubernetes_priority_class" "monitoring_priority" {

metadata {
name = "monitoring-priority"
}
value = 50
}

# Used for anything stateful that should not be frequently preempted.
resource "kubernetes_priority_class" "stateful_priority" {

metadata {
name = "stateful-priority"
}
value = 60
}

# Used for daemons that run on the machines and need to be scheduled no matter what.
resource "kubernetes_priority_class" "daemon_priority" {

metadata {
name = "daemon-priority"
}
value = 90
}
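
The remaining diffs in this commit swap the per-service priority classes for these four shared ones. A minimal sketch of how a workload inside this module consumes a class (the deployment name, labels, and image below are placeholders, not resources from this commit):

resource "kubernetes_deployment" "example_service" {
  metadata {
    name = "example-service"
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "example-service"
      }
    }

    template {
      metadata {
        labels = {
          app = "example-service"
        }
      }

      spec {
        # Referencing the resource (rather than the raw string) makes Terraform
        # create the PriorityClass before any Deployment that uses it. Pods in
        # this class (value 50) do not preempt each other; when capacity runs
        # out, Karpenter boots a new node instead.
        priority_class_name = kubernetes_priority_class.service_priority.metadata.0.name

        container {
          name  = "example"
          image = "registry.k8s.io/pause"
        }
      }
    }
  }
}

Workloads that live in other root modules (for example the cloudflared deployment and the Grafana chart above) cannot reference these resources directly, so they pass the class name as a plain string such as "service-priority" or "monitoring-priority".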
23 changes: 2 additions & 21 deletions infra/tf/k8s_infra/prometheus.tf
@@ -86,25 +86,6 @@ resource "kubernetes_namespace" "prometheus" {
}
}

# Set a high priority for Node Exporter so it can run on all nodes
resource "kubernetes_priority_class" "node_exporter_priority" {
count = var.prometheus_enabled ? 1 : 0

metadata {
name = "node-exporter-priority"
}
value = 90
}

resource "kubernetes_priority_class" "prometheus_priority" {
count = var.prometheus_enabled ? 1 : 0

metadata {
name = "prometheus-priority"
}
value = 40
}

resource "helm_release" "prometheus" {
count = var.prometheus_enabled ? 1 : 0
depends_on = [helm_release.vpa]
@@ -122,7 +103,7 @@
cpu = "${local.service_node_exporter.resources.cpu}m"
}
} : null
priorityClassName = kubernetes_priority_class.node_exporter_priority.0.metadata.0.name
priorityClassName = kubernetes_priority_class.daemon_priority.metadata.0.name
affinity = {
nodeAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = {
@@ -270,7 +251,7 @@
}
}

priorityClassName = kubernetes_priority_class.prometheus_priority.0.metadata.0.name
priorityClassName = kubernetes_priority_class.monitoring_priority.metadata.0.name
resources = var.limit_resources ? {
limits = {
memory = "${local.service_prometheus.resources.memory}Mi"