From 1856131b95cea4f7d14979d7425a650c75919824 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Fri, 10 Apr 2026 18:37:58 +0400 Subject: [PATCH 1/4] feat: add preStop drain and terminationGracePeriodSeconds support to Caddy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During rolling updates, Caddy pods receive SIGTERM while Cloudflare still holds persistent connections, causing 521 errors. Add two new CaddyConfig fields to allow operators to configure graceful drain: - preStopSleepSeconds: injects a preStop exec sleep on all containers so load-balancer endpoint propagation completes before SIGTERM is sent - terminationGracePeriodSeconds: pod-level override to ensure the grace period is long enough to cover the preStop sleep + Caddy shutdown Both fields are wired through Args → SimpleContainerArgs → PodSpec and the Lifecycle hook respectively. --- pkg/clouds/k8s/types.go | 7 +++++ pkg/clouds/pulumi/kubernetes/caddy.go | 26 ++++++++++--------- pkg/clouds/pulumi/kubernetes/deployment.go | 20 ++++++++++++-- .../pulumi/kubernetes/simple_container.go | 13 +++++++--- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/pkg/clouds/k8s/types.go b/pkg/clouds/k8s/types.go index 0009e492..fe4c1479 100644 --- a/pkg/clouds/k8s/types.go +++ b/pkg/clouds/k8s/types.go @@ -50,6 +50,13 @@ type CaddyConfig struct { UseSSL *bool `json:"useSSL,omitempty" yaml:"useSSL,omitempty"` // whether to use ssl by default (default: true) // Deployment name override for existing Caddy deployments (used when adopting clusters) DeploymentName *string `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"` // override deployment name when adopting existing Caddy + // TerminationGracePeriodSeconds overrides the pod-level terminationGracePeriodSeconds for Caddy. + // Should be greater than preStopSleepSeconds. Default: Kubernetes default (30s). 
+ TerminationGracePeriodSeconds *int `json:"terminationGracePeriodSeconds,omitempty" yaml:"terminationGracePeriodSeconds,omitempty"` + // PreStopSleepSeconds inserts a preStop exec sleep before SIGTERM is sent to Caddy. + // Allows load-balancer endpoint propagation and in-flight connection drain before shutdown. + // Prevents Cloudflare 521 errors during rolling updates. Default: 0 (disabled). + PreStopSleepSeconds *int `json:"preStopSleepSeconds,omitempty" yaml:"preStopSleepSeconds,omitempty"` } type DisruptionBudget struct { diff --git a/pkg/clouds/pulumi/kubernetes/caddy.go b/pkg/clouds/pulumi/kubernetes/caddy.go index 60cab6b6..ff8807d4 100644 --- a/pkg/clouds/pulumi/kubernetes/caddy.go +++ b/pkg/clouds/pulumi/kubernetes/caddy.go @@ -243,18 +243,20 @@ func DeployCaddyService(ctx *sdk.Context, caddy CaddyDeployment, input api.Resou } sc, err := DeploySimpleContainer(ctx, Args{ - ServiceType: serviceType, // to provision external IP - ProvisionIngress: caddy.ProvisionIngress, - UseSSL: useSSL, - Namespace: namespace, - DeploymentName: deploymentName, - Input: input, - ServiceAccountName: lo.ToPtr(serviceAccount.Name), - Deployment: deploymentConfig, - SecretVolumes: caddy.SecretVolumes, // Cloud credentials volumes (e.g., GCP service account) - SecretVolumeOutputs: caddy.SecretVolumeOutputs, // Pulumi outputs for secret volumes - SecretEnvs: secretEnvs, // Secret environment variables - VPA: caddy.VPA, // Vertical Pod Autoscaler configuration for Caddy + ServiceType: serviceType, // to provision external IP + ProvisionIngress: caddy.ProvisionIngress, + UseSSL: useSSL, + Namespace: namespace, + DeploymentName: deploymentName, + Input: input, + ServiceAccountName: lo.ToPtr(serviceAccount.Name), + Deployment: deploymentConfig, + SecretVolumes: caddy.SecretVolumes, // Cloud credentials volumes (e.g., GCP service account) + SecretVolumeOutputs: caddy.SecretVolumeOutputs, // Pulumi outputs for secret volumes + SecretEnvs: secretEnvs, // Secret environment variables + 
VPA: caddy.VPA, // Vertical Pod Autoscaler configuration for Caddy + TerminationGracePeriodSeconds: lo.FromPtr(caddy.CaddyConfig).TerminationGracePeriodSeconds, + PreStopSleepSeconds: lo.FromPtr(caddy.CaddyConfig).PreStopSleepSeconds, Images: []*ContainerImage{ { Container: caddyContainer, diff --git a/pkg/clouds/pulumi/kubernetes/deployment.go b/pkg/clouds/pulumi/kubernetes/deployment.go index ca792c32..249de581 100644 --- a/pkg/clouds/pulumi/kubernetes/deployment.go +++ b/pkg/clouds/pulumi/kubernetes/deployment.go @@ -46,6 +46,10 @@ type Args struct { ReadinessProbe *k8s.CloudRunProbe // Global readiness probe configuration LivenessProbe *k8s.CloudRunProbe // Global liveness probe configuration EphemeralSize string + // TerminationGracePeriodSeconds overrides pod-level terminationGracePeriodSeconds. + TerminationGracePeriodSeconds *int + // PreStopSleepSeconds injects a preStop exec sleep on all containers, allowing LB drain before SIGTERM. + PreStopSleepSeconds *int } func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOption) (*SimpleContainer, error) { @@ -181,13 +185,24 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti resources.Requests = sdk.ToStringMap(c.Container.Resources.Requests) } + var lifecycle *corev1.LifecycleArgs + if args.PreStopSleepSeconds != nil && *args.PreStopSleepSeconds > 0 { + lifecycle = &corev1.LifecycleArgs{ + PreStop: &corev1.LifecycleHandlerArgs{ + Exec: &corev1.ExecActionArgs{ + Command: sdk.ToStringArray([]string{"sleep", fmt.Sprintf("%d", *args.PreStopSleepSeconds)}), + }, + }, + } + } + return corev1.ContainerArgs{ Args: sdk.ToStringArray(c.Container.Args), Command: sdk.ToStringArray(c.Container.Command), Env: env, Image: c.ImageName, ImagePullPolicy: sdk.String(lo.If(c.Container.ImagePullPolicy != nil, lo.FromPtr(c.Container.ImagePullPolicy)).Else("IfNotPresent")), - Lifecycle: nil, // TODO + Lifecycle: lifecycle, LivenessProbe: livenessProbe, Name: 
sdk.String(c.Container.Name), Ports: ports, @@ -250,7 +265,8 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti SecretVolumes: args.SecretVolumes, SecretVolumeOutputs: args.SecretVolumeOutputs, ImagePullSecret: args.ImagePullSecret, - EphemeralSize: args.EphemeralSize, + EphemeralSize: args.EphemeralSize, + TerminationGracePeriodSeconds: args.TerminationGracePeriodSeconds, }, opts...) if err != nil { return nil, errors.Wrapf(err, "failed to provision simple container for stack %q in %q", stackName, args.Input.StackParams.Environment) diff --git a/pkg/clouds/pulumi/kubernetes/simple_container.go b/pkg/clouds/pulumi/kubernetes/simple_container.go index 09cab279..3b9a9433 100644 --- a/pkg/clouds/pulumi/kubernetes/simple_container.go +++ b/pkg/clouds/pulumi/kubernetes/simple_container.go @@ -138,7 +138,8 @@ type SimpleContainerArgs struct { ComputeContext pApi.ComputeContext ImagePullSecret *docker.RegistryCredentials UseSSL bool - EphemeralSize string + EphemeralSize string + TerminationGracePeriodSeconds *int } type SimpleContainer struct { @@ -517,8 +518,14 @@ func NewSimpleContainer(ctx *sdk.Context, args *SimpleContainerArgs, opts ...sdk args.Log.Info(ctx.Context(), "🔍 DEBUG: Converted affinity result: %+v", convertedAffinity) podSpecArgs := &corev1.PodSpecArgs{ - NodeSelector: sdk.ToStringMap(args.NodeSelector), - Affinity: convertedAffinity, + NodeSelector: sdk.ToStringMap(args.NodeSelector), + Affinity: convertedAffinity, + TerminationGracePeriodSeconds: func() sdk.IntPtrInput { + if args.TerminationGracePeriodSeconds != nil { + return sdk.IntPtr(*args.TerminationGracePeriodSeconds) + } + return nil + }(), InitContainers: sdk.All(initContainerOutputs...).ApplyT(func(scOuts []any) (corev1.ContainerArray, error) { for _, c := range scOuts { initContainers = append(initContainers, c.(corev1.ContainerInput)) From a9f535f3c805e268bd5c11f80f81af946f568652 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Fri, 10 Apr 2026 20:06:41 
+0400 Subject: [PATCH 2/4] fix: make caddy-updated-at content-driven to prevent spurious rolling restarts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit time.Now() was used at pulumi eval time, so caddy-updated-at always changed on every pulumi up even when the Caddyfile was identical. This dirtied the pod template on every app deployment, causing a Caddy rolling restart each time — which triggered Cloudflare 521 errors due to persistent connections being dropped before Cloudflare rerouted them. History: the original value was the static string "latest" (PR #59 changed it to time.Now() as an "improvement"). The intent was informational — not a rollout trigger. Fix: derive caddy-updated-at from the Caddyfile content hash (same source as caddy-update-hash). The annotation value is now stable across pulumi ups when the Caddyfile hasn't changed, so K8s sees no pod template diff → no rollout. Caddy still rolls when the Caddyfile actually changes (different hash). Confirmed root cause via GCP Cloud Logging: all three Caddy patch events on 2026-04-10 had identical hash (03709a04d391d8ac) but different timestamps, proving time.Now() was the sole cause of every rollout. 
--- pkg/clouds/pulumi/gcp/gke_autopilot_stack.go | 11 +++++++++-- pkg/clouds/pulumi/kubernetes/kube_run.go | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go b/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go index f35393be..d3cf2039 100644 --- a/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go +++ b/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go @@ -9,7 +9,6 @@ import ( "os" "os/exec" "strings" - "time" auth "golang.org/x/oauth2/google" @@ -235,7 +234,15 @@ func GkeAutopilotStack(ctx *sdk.Context, stack api.Stack, input api.ResourceInpu Kubeconfig: &kubeConfigOutput, Annotations: map[string]sdk.StringOutput{ "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), - "simple-container.com/caddy-updated-at": sdk.String(time.Now().UTC().Format(time.RFC3339)).ToStringOutput(), + // caddy-updated-at is derived from the Caddyfile hash, NOT from time.Now(). + // Using time.Now() at pulumi eval time would dirty the pod template on every + // pulumi up even when the Caddyfile didn't change, causing spurious Caddy rolling + // restarts and downstream Cloudflare 521 errors. The value here is informational + // (shows which hash revision was last deployed) rather than a wall-clock timestamp. 
+ "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { + sum := md5.Sum([]byte(entry[0].(string))) + return hex.EncodeToString(sum[:])[:8] // short prefix — readable, stable, content-driven + }).(sdk.StringOutput), "simple-container.com/caddy-update-hash": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { sum := md5.Sum([]byte(entry[0].(string))) return hex.EncodeToString(sum[:]) diff --git a/pkg/clouds/pulumi/kubernetes/kube_run.go b/pkg/clouds/pulumi/kubernetes/kube_run.go index e7f2d92d..e629c7da 100644 --- a/pkg/clouds/pulumi/kubernetes/kube_run.go +++ b/pkg/clouds/pulumi/kubernetes/kube_run.go @@ -5,7 +5,6 @@ import ( "encoding/hex" "encoding/json" "fmt" - "time" "github.com/pkg/errors" "github.com/samber/lo" @@ -216,7 +215,15 @@ func KubeRun(ctx *sdk.Context, stack api.Stack, input api.ResourceInput, params Kubeconfig: &kubeconfigOutput, Annotations: map[string]sdk.StringOutput{ "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), - "simple-container.com/caddy-updated-at": sdk.String(time.Now().UTC().Format(time.RFC3339)).ToStringOutput(), + // caddy-updated-at is derived from the Caddyfile hash, NOT from time.Now(). + // Using time.Now() at pulumi eval time would dirty the pod template on every + // pulumi up even when the Caddyfile didn't change, causing spurious Caddy rolling + // restarts and downstream Cloudflare 521 errors. The value here is informational + // (shows which hash revision was last deployed) rather than a wall-clock timestamp. 
+ "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { + sum := md5.Sum([]byte(entry[0].(string))) + return hex.EncodeToString(sum[:])[:8] // short prefix — readable, stable, content-driven + }).(sdk.StringOutput), "simple-container.com/caddy-update-hash": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { sum := md5.Sum([]byte(entry[0].(string))) return hex.EncodeToString(sum[:]) From 0ed4f5c957ef6d5c3a5ec89678a0037f1a418e88 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Fri, 10 Apr 2026 20:34:49 +0400 Subject: [PATCH 3/4] fix: move Caddy informational annotations to deployment metadata to prevent spurious pod restarts Root cause: caddy-updated-at/caddy-updated-by were patched into spec.template.metadata.annotations which triggers a rolling restart on every change. Combined with time.Now() being evaluated on every pulumi up, this caused Caddy to roll on every app deploy, producing Cloudflare 521 errors while the old pod was terminating. 
Changes: - DeploymentPatchArgs gains DeploymentAnnotations for metadata-only patches (no pod restart) - caddy-updated-at and caddy-updated-by moved to DeploymentAnnotations in gke_autopilot_stack.go and kube_run.go; only caddy-update-hash (content-driven) remains in pod template annotations - Extract buildPodTemplatePatch/buildDeploymentMetadataPatch helpers for testability - Extract buildPreStopLifecycle helper for testability - Add unit tests: patch target isolation, preStop lifecycle injection (nil/zero/positive) - Fix gci import formatting in caddy.go, deployment.go, simple_container.go, deployment_patch.go --- pkg/clouds/pulumi/gcp/gke_autopilot_stack.go | 21 +-- pkg/clouds/pulumi/kubernetes/caddy.go | 8 +- pkg/clouds/pulumi/kubernetes/deployment.go | 40 +++--- .../pulumi/kubernetes/deployment_patch.go | 135 ++++++++++++------ .../kubernetes/deployment_patch_test.go | 121 ++++++++++++++++ pkg/clouds/pulumi/kubernetes/kube_run.go | 21 +-- .../pulumi/kubernetes/simple_container.go | 30 ++-- 7 files changed, 276 insertions(+), 100 deletions(-) create mode 100644 pkg/clouds/pulumi/kubernetes/deployment_patch_test.go diff --git a/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go b/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go index d3cf2039..033b862f 100644 --- a/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go +++ b/pkg/clouds/pulumi/gcp/gke_autopilot_stack.go @@ -232,22 +232,23 @@ func GkeAutopilotStack(ctx *sdk.Context, stack api.Stack, input api.ResourceInpu Namespace: namespace, KubeProvider: kubeProvider, Kubeconfig: &kubeConfigOutput, + // caddy-update-hash goes into spec.template.metadata so Caddy pods roll only when + // the Caddyfile actually changes. Content-hash, not wall-clock time, prevents + // spurious restarts (and Cloudflare 521s) on every pulumi up. Annotations: map[string]sdk.StringOutput{ - "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), - // caddy-updated-at is derived from the Caddyfile hash, NOT from time.Now(). 
- // Using time.Now() at pulumi eval time would dirty the pod template on every - // pulumi up even when the Caddyfile didn't change, causing spurious Caddy rolling - // restarts and downstream Cloudflare 521 errors. The value here is informational - // (shows which hash revision was last deployed) rather than a wall-clock timestamp. - "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { - sum := md5.Sum([]byte(entry[0].(string))) - return hex.EncodeToString(sum[:])[:8] // short prefix — readable, stable, content-driven - }).(sdk.StringOutput), "simple-container.com/caddy-update-hash": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { sum := md5.Sum([]byte(entry[0].(string))) return hex.EncodeToString(sum[:]) }).(sdk.StringOutput), }, + // Informational annotations live on deployment metadata only — no pod restarts. + DeploymentAnnotations: map[string]sdk.StringOutput{ + "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), + "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { + sum := md5.Sum([]byte(entry[0].(string))) + return hex.EncodeToString(sum[:])[:8] + }).(sdk.StringOutput), + }, Opts: []sdk.ResourceOption{sdk.DependsOn([]sdk.Resource{sc.Service})}, }) if patchErr != nil { diff --git a/pkg/clouds/pulumi/kubernetes/caddy.go b/pkg/clouds/pulumi/kubernetes/caddy.go index ff8807d4..fd6a0312 100644 --- a/pkg/clouds/pulumi/kubernetes/caddy.go +++ b/pkg/clouds/pulumi/kubernetes/caddy.go @@ -251,10 +251,10 @@ func DeployCaddyService(ctx *sdk.Context, caddy CaddyDeployment, input api.Resou Input: input, ServiceAccountName: lo.ToPtr(serviceAccount.Name), Deployment: deploymentConfig, - SecretVolumes: caddy.SecretVolumes, // Cloud credentials volumes (e.g., GCP service account) - SecretVolumeOutputs: caddy.SecretVolumeOutputs, // Pulumi outputs for secret volumes - SecretEnvs: secretEnvs, // Secret environment variables - VPA: 
caddy.VPA, // Vertical Pod Autoscaler configuration for Caddy + SecretVolumes: caddy.SecretVolumes, // Cloud credentials volumes (e.g., GCP service account) + SecretVolumeOutputs: caddy.SecretVolumeOutputs, // Pulumi outputs for secret volumes + SecretEnvs: secretEnvs, // Secret environment variables + VPA: caddy.VPA, // Vertical Pod Autoscaler configuration for Caddy TerminationGracePeriodSeconds: lo.FromPtr(caddy.CaddyConfig).TerminationGracePeriodSeconds, PreStopSleepSeconds: lo.FromPtr(caddy.CaddyConfig).PreStopSleepSeconds, Images: []*ContainerImage{ diff --git a/pkg/clouds/pulumi/kubernetes/deployment.go b/pkg/clouds/pulumi/kubernetes/deployment.go index 249de581..41bdae65 100644 --- a/pkg/clouds/pulumi/kubernetes/deployment.go +++ b/pkg/clouds/pulumi/kubernetes/deployment.go @@ -185,16 +185,7 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti resources.Requests = sdk.ToStringMap(c.Container.Resources.Requests) } - var lifecycle *corev1.LifecycleArgs - if args.PreStopSleepSeconds != nil && *args.PreStopSleepSeconds > 0 { - lifecycle = &corev1.LifecycleArgs{ - PreStop: &corev1.LifecycleHandlerArgs{ - Exec: &corev1.ExecActionArgs{ - Command: sdk.ToStringArray([]string{"sleep", fmt.Sprintf("%d", *args.PreStopSleepSeconds)}), - }, - }, - } - } + lifecycle := buildPreStopLifecycle(args.PreStopSleepSeconds) return corev1.ContainerArgs{ Args: sdk.ToStringArray(c.Container.Args), @@ -259,12 +250,12 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti PodDisruption: lo.If(args.Deployment.DisruptionBudget != nil, args.Deployment.DisruptionBudget).Else(&k8s.DisruptionBudget{ MinAvailable: lo.ToPtr(1), }), - RollingUpdate: lo.If(args.Deployment.RollingUpdate != nil, toRollingUpdateArgs(args.Deployment.RollingUpdate)).Else(nil), - SecurityContext: nil, // TODO - Log: args.Params.Log, - SecretVolumes: args.SecretVolumes, - SecretVolumeOutputs: args.SecretVolumeOutputs, - ImagePullSecret: args.ImagePullSecret, 
+ RollingUpdate: lo.If(args.Deployment.RollingUpdate != nil, toRollingUpdateArgs(args.Deployment.RollingUpdate)).Else(nil), + SecurityContext: nil, // TODO + Log: args.Params.Log, + SecretVolumes: args.SecretVolumes, + SecretVolumeOutputs: args.SecretVolumeOutputs, + ImagePullSecret: args.ImagePullSecret, EphemeralSize: args.EphemeralSize, TerminationGracePeriodSeconds: args.TerminationGracePeriodSeconds, }, opts...) @@ -289,6 +280,23 @@ func DeploySimpleContainer(ctx *sdk.Context, args Args, opts ...sdk.ResourceOpti return sc, nil } +// buildPreStopLifecycle returns a LifecycleArgs with an exec sleep preStop hook when +// preStopSleepSeconds is set and > 0. The sleep lets the load-balancer finish draining +// connections before the container receives SIGTERM, preventing 502/521 errors during +// rolling updates. +func buildPreStopLifecycle(preStopSleepSeconds *int) *corev1.LifecycleArgs { + if preStopSleepSeconds == nil || *preStopSleepSeconds <= 0 { + return nil + } + return &corev1.LifecycleArgs{ + PreStop: &corev1.LifecycleHandlerArgs{ + Exec: &corev1.ExecActionArgs{ + Command: sdk.ToStringArray([]string{"sleep", fmt.Sprintf("%d", *preStopSleepSeconds)}), + }, + }, + } +} + func toRollingUpdateArgs(update *k8s.RollingUpdate) *v1.RollingUpdateDeploymentArgs { return &v1.RollingUpdateDeploymentArgs{ MaxUnavailable: lo.If(lo.FromPtr(update).MaxUnavailable != nil, sdk.IntPtrFromPtr(lo.FromPtr(update).MaxUnavailable)).Else(nil), diff --git a/pkg/clouds/pulumi/kubernetes/deployment_patch.go b/pkg/clouds/pulumi/kubernetes/deployment_patch.go index 76af3e1a..bdf2a5aa 100644 --- a/pkg/clouds/pulumi/kubernetes/deployment_patch.go +++ b/pkg/clouds/pulumi/kubernetes/deployment_patch.go @@ -17,20 +17,50 @@ import ( ) type DeploymentPatchArgs struct { - PatchName string - ServiceName string - Namespace string - Annotations map[string]sdk.StringOutput - KubeProvider *sdkK8s.Provider // Main Kubernetes provider (for dependencies) - Kubeconfig *sdk.StringOutput // Optional: 
Kubeconfig for creating patch-specific provider - Opts []sdk.ResourceOption + PatchName string + ServiceName string + Namespace string + // Annotations are applied to spec.template.metadata — changes here trigger a pod rolling update. + // Use only for values that should restart pods when changed (e.g. content hashes). + Annotations map[string]sdk.StringOutput + // DeploymentAnnotations are applied to metadata only — changes do NOT trigger pod restarts. + // Use for informational annotations (e.g. caddy-updated-at, caddy-updated-by). + DeploymentAnnotations map[string]sdk.StringOutput + KubeProvider *sdkK8s.Provider // Main Kubernetes provider (for dependencies) + Kubeconfig *sdk.StringOutput // Optional: Kubeconfig for creating patch-specific provider + Opts []sdk.ResourceOption } type deploymentPatchInputs struct { - Kubeconfig string - Namespace string - ServiceName string - Annotations map[string]string + Kubeconfig string + Namespace string + ServiceName string + Annotations map[string]string + DeploymentAnnotations map[string]string +} + +// buildPodTemplatePatch returns the JSON-encoded strategic merge patch body that targets spec.template.metadata.annotations. +// Changes here cause a rolling restart of pods. +func buildPodTemplatePatch(annotations map[string]string) ([]byte, error) { + return json.Marshal(map[string]interface{}{ + "spec": map[string]interface{}{ + "template": map[string]interface{}{ + "metadata": map[string]interface{}{ + "annotations": annotations, + }, + }, + }, + }) +} + +// buildDeploymentMetadataPatch returns the JSON-encoded strategic merge patch body that targets metadata.annotations. +// Changes here do NOT trigger pod restarts. 
+func buildDeploymentMetadataPatch(annotations map[string]string) ([]byte, error) { + return json.Marshal(map[string]interface{}{ + "metadata": map[string]interface{}{ + "annotations": annotations, + }, + }) } func patchDeploymentWithK8sClient(ctx context.Context, inputs deploymentPatchInputs) error { @@ -45,40 +75,48 @@ func patchDeploymentWithK8sClient(ctx context.Context, inputs deploymentPatchInp return fmt.Errorf("failed to create Kubernetes client: %w", err) } - // Build the patch payload - only the annotations we want to update - patch := map[string]interface{}{ - "spec": map[string]interface{}{ - "template": map[string]interface{}{ - "metadata": map[string]interface{}{ - "annotations": inputs.Annotations, - }, - }, - }, + patchOptions := metav1.PatchOptions{ + FieldManager: "simple-container", } - // Marshal to JSON - patchBytes, err := json.Marshal(patch) - if err != nil { - return fmt.Errorf("failed to marshal patch: %w", err) - } + // Patch spec.template.metadata.annotations — triggers rolling restart when values change. 
+ if len(inputs.Annotations) > 0 { + patchBytes, err := buildPodTemplatePatch(inputs.Annotations) + if err != nil { + return fmt.Errorf("failed to marshal pod-template annotations patch: %w", err) + } - // Apply the patch using Strategic Merge Patch - // This is a true partial update that doesn't require full deployment spec - patchOptions := metav1.PatchOptions{ - FieldManager: "simple-container", + _, err = clientSet.AppsV1().Deployments(inputs.Namespace).Patch( + ctx, + inputs.ServiceName, + types.StrategicMergePatchType, + patchBytes, + patchOptions, + ) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "❌ PATCH ERROR: failed to patch deployment pod-template annotations %s/%s: %v\n", inputs.Namespace, inputs.ServiceName, err) + return fmt.Errorf("failed to patch deployment %s/%s: %w", inputs.Namespace, inputs.ServiceName, err) + } } - _, err = clientSet.AppsV1().Deployments(inputs.Namespace).Patch( - ctx, - inputs.ServiceName, - types.StrategicMergePatchType, - patchBytes, - patchOptions, - ) - if err != nil { - // Log detailed error information for debugging - _, _ = fmt.Fprintf(os.Stderr, "❌ PATCH ERROR: failed to patch deployment %s/%s: %v\n", inputs.Namespace, inputs.ServiceName, err) - return fmt.Errorf("failed to patch deployment %s/%s: %w", inputs.Namespace, inputs.ServiceName, err) + // Patch metadata.annotations — informational only, does NOT trigger pod restarts. 
+ if len(inputs.DeploymentAnnotations) > 0 { + patchBytes, err := buildDeploymentMetadataPatch(inputs.DeploymentAnnotations) + if err != nil { + return fmt.Errorf("failed to marshal deployment annotations patch: %w", err) + } + + _, err = clientSet.AppsV1().Deployments(inputs.Namespace).Patch( + ctx, + inputs.ServiceName, + types.StrategicMergePatchType, + patchBytes, + patchOptions, + ) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "❌ PATCH ERROR: failed to patch deployment metadata annotations %s/%s: %v\n", inputs.Namespace, inputs.ServiceName, err) + return fmt.Errorf("failed to patch deployment metadata annotations %s/%s: %w", inputs.Namespace, inputs.ServiceName, err) + } } return nil @@ -90,10 +128,11 @@ func PatchDeployment(ctx *sdk.Context, args *DeploymentPatchArgs) (*sdk.StringOu // Convert map[string]StringOutput to StringMapOutput for proper resolution annotationsOutput := sdk.ToStringMapOutput(args.Annotations) + deploymentAnnotationsOutput := sdk.ToStringMapOutput(args.DeploymentAnnotations) // Apply the patch when all outputs are resolved // Use ApplyTWithContext to get access to Pulumi's context - result := sdk.All(args.Kubeconfig, annotationsOutput).ApplyTWithContext(ctx.Context(), func(goCtx context.Context, vals []interface{}) (string, error) { + result := sdk.All(args.Kubeconfig, annotationsOutput, deploymentAnnotationsOutput).ApplyTWithContext(ctx.Context(), func(goCtx context.Context, vals []interface{}) (string, error) { kubeconfigStr, ok := vals[0].(string) if !ok || kubeconfigStr == "" { return "", fmt.Errorf("kubeconfig is required for native Kubernetes client patching") @@ -104,11 +143,17 @@ func PatchDeployment(ctx *sdk.Context, args *DeploymentPatchArgs) (*sdk.StringOu return "", fmt.Errorf("failed to resolve annotations: got type %T", vals[1]) } + deploymentAnnotations, ok := vals[2].(map[string]string) + if !ok { + return "", fmt.Errorf("failed to resolve deployment annotations: got type %T", vals[2]) + } + inputs := 
deploymentPatchInputs{ - Kubeconfig: kubeconfigStr, - Namespace: args.Namespace, - ServiceName: args.ServiceName, - Annotations: annotations, + Kubeconfig: kubeconfigStr, + Namespace: args.Namespace, + ServiceName: args.ServiceName, + Annotations: annotations, + DeploymentAnnotations: deploymentAnnotations, } // Create a context that respects parent cancellation but allows extra time for patch to complete diff --git a/pkg/clouds/pulumi/kubernetes/deployment_patch_test.go b/pkg/clouds/pulumi/kubernetes/deployment_patch_test.go new file mode 100644 index 00000000..777ab045 --- /dev/null +++ b/pkg/clouds/pulumi/kubernetes/deployment_patch_test.go @@ -0,0 +1,121 @@ +package kubernetes + +import ( + "encoding/json" + "testing" + + . "github.com/onsi/gomega" + "github.com/samber/lo" +) + +// TestBuildPodTemplatePatch verifies the pod-template annotation patch targets +// spec.template.metadata, which triggers a rolling restart on change. +func TestBuildPodTemplatePatch(t *testing.T) { + RegisterTestingT(t) + + annotations := map[string]string{ + "simple-container.com/caddy-update-hash": "abc123", + } + + patchBytes, err := buildPodTemplatePatch(annotations) + Expect(err).ToNot(HaveOccurred()) + + var patch map[string]interface{} + Expect(json.Unmarshal(patchBytes, &patch)).To(Succeed()) + + // Must have spec.template.metadata.annotations path + spec, ok := patch["spec"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "patch must have 'spec' key") + template, ok := spec["template"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "spec must have 'template' key") + metadata, ok := template["metadata"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "template must have 'metadata' key") + ann, ok := metadata["annotations"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "metadata must have 'annotations' key") + Expect(ann["simple-container.com/caddy-update-hash"]).To(Equal("abc123")) + + // Must NOT have top-level metadata key (that would be the deployment, not 
pod template) + Expect(patch).ToNot(HaveKey("metadata")) +} + +// TestBuildDeploymentMetadataPatch verifies the deployment-level annotation patch targets +// metadata only (not spec.template), so it does NOT trigger pod restarts. +func TestBuildDeploymentMetadataPatch(t *testing.T) { + RegisterTestingT(t) + + annotations := map[string]string{ + "simple-container.com/caddy-updated-by": "my-stack", + "simple-container.com/caddy-updated-at": "deadbeef", + } + + patchBytes, err := buildDeploymentMetadataPatch(annotations) + Expect(err).ToNot(HaveOccurred()) + + var patch map[string]interface{} + Expect(json.Unmarshal(patchBytes, &patch)).To(Succeed()) + + // Must have top-level metadata.annotations + metadata, ok := patch["metadata"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "patch must have 'metadata' key") + ann, ok := metadata["annotations"].(map[string]interface{}) + Expect(ok).To(BeTrue(), "metadata must have 'annotations' key") + Expect(ann["simple-container.com/caddy-updated-by"]).To(Equal("my-stack")) + Expect(ann["simple-container.com/caddy-updated-at"]).To(Equal("deadbeef")) + + // Must NOT touch spec.template (no rolling restart) + Expect(patch).ToNot(HaveKey("spec")) +} + +// TestPatchTargetsSeparation verifies the two patch helpers produce disjoint JSON structures, +// confirming that informational annotations cannot accidentally trigger pod restarts. 
+func TestPatchTargetsSeparation(t *testing.T) { + RegisterTestingT(t) + + podTemplateBytes, err := buildPodTemplatePatch(map[string]string{"k": "v"}) + Expect(err).ToNot(HaveOccurred()) + + deploymentBytes, err := buildDeploymentMetadataPatch(map[string]string{"k": "v"}) + Expect(err).ToNot(HaveOccurred()) + + var podPatch, deployPatch map[string]interface{} + Expect(json.Unmarshal(podTemplateBytes, &podPatch)).To(Succeed()) + Expect(json.Unmarshal(deploymentBytes, &deployPatch)).To(Succeed()) + + // Pod template patch must NOT have top-level metadata + Expect(podPatch).ToNot(HaveKey("metadata")) + // Deployment metadata patch must NOT have spec + Expect(deployPatch).ToNot(HaveKey("spec")) +} + +// TestBuildPreStopLifecycle verifies that preStop sleep injection works correctly. +func TestBuildPreStopLifecycle(t *testing.T) { + t.Run("nil preStopSleepSeconds returns nil lifecycle", func(t *testing.T) { + RegisterTestingT(t) + Expect(buildPreStopLifecycle(nil)).To(BeNil()) + }) + + t.Run("zero preStopSleepSeconds returns nil lifecycle", func(t *testing.T) { + RegisterTestingT(t) + Expect(buildPreStopLifecycle(lo.ToPtr(0))).To(BeNil()) + }) + + t.Run("negative preStopSleepSeconds returns nil lifecycle", func(t *testing.T) { + RegisterTestingT(t) + Expect(buildPreStopLifecycle(lo.ToPtr(-1))).To(BeNil()) + }) + + t.Run("positive preStopSleepSeconds injects exec sleep", func(t *testing.T) { + RegisterTestingT(t) + + lifecycle := buildPreStopLifecycle(lo.ToPtr(10)) + Expect(lifecycle).ToNot(BeNil()) + // PreStop is a PtrInput — verify the field is populated (non-nil interface) + Expect(lifecycle.PreStop).ToNot(BeNil()) + }) + + t.Run("preStopSleepSeconds 1 is accepted", func(t *testing.T) { + RegisterTestingT(t) + lifecycle := buildPreStopLifecycle(lo.ToPtr(1)) + Expect(lifecycle).ToNot(BeNil(), "smallest valid value should produce a lifecycle") + }) +} diff --git a/pkg/clouds/pulumi/kubernetes/kube_run.go b/pkg/clouds/pulumi/kubernetes/kube_run.go index 
e629c7da..200989f1 100644 --- a/pkg/clouds/pulumi/kubernetes/kube_run.go +++ b/pkg/clouds/pulumi/kubernetes/kube_run.go @@ -213,22 +213,23 @@ func KubeRun(ctx *sdk.Context, stack api.Stack, input api.ResourceInput, params Namespace: lo.If(caddyConfig.Namespace != nil, lo.FromPtr(caddyConfig.Namespace)).Else("caddy"), KubeProvider: kubeProvider, Kubeconfig: &kubeconfigOutput, + // caddy-update-hash goes into spec.template.metadata so Caddy pods roll only when + // the Caddyfile actually changes. Content-hash, not wall-clock time, prevents + // spurious restarts (and Cloudflare 521s) on every pulumi up. Annotations: map[string]sdk.StringOutput{ - "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), - // caddy-updated-at is derived from the Caddyfile hash, NOT from time.Now(). - // Using time.Now() at pulumi eval time would dirty the pod template on every - // pulumi up even when the Caddyfile didn't change, causing spurious Caddy rolling - // restarts and downstream Cloudflare 521 errors. The value here is informational - // (shows which hash revision was last deployed) rather than a wall-clock timestamp. - "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { - sum := md5.Sum([]byte(entry[0].(string))) - return hex.EncodeToString(sum[:])[:8] // short prefix — readable, stable, content-driven - }).(sdk.StringOutput), "simple-container.com/caddy-update-hash": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { sum := md5.Sum([]byte(entry[0].(string))) return hex.EncodeToString(sum[:]) }).(sdk.StringOutput), }, + // Informational annotations live on deployment metadata only — no pod restarts. 
+ DeploymentAnnotations: map[string]sdk.StringOutput{ + "simple-container.com/caddy-updated-by": sdk.String(stackName).ToStringOutput(), + "simple-container.com/caddy-updated-at": sdk.All(sc.CaddyfileEntry).ApplyT(func(entry []any) string { + sum := md5.Sum([]byte(entry[0].(string))) + return hex.EncodeToString(sum[:])[:8] + }).(sdk.StringOutput), + }, Opts: []sdk.ResourceOption{sdk.DependsOn([]sdk.Resource{sc.Service})}, }) if patchErr != nil { diff --git a/pkg/clouds/pulumi/kubernetes/simple_container.go b/pkg/clouds/pulumi/kubernetes/simple_container.go index 3b9a9433..cd04a9c4 100644 --- a/pkg/clouds/pulumi/kubernetes/simple_container.go +++ b/pkg/clouds/pulumi/kubernetes/simple_container.go @@ -125,19 +125,19 @@ type SimpleContainerArgs struct { Log logger.Logger // ... - RollingUpdate *v1.RollingUpdateDeploymentArgs - InitContainers []corev1.ContainerArgs - Containers []corev1.ContainerArgs - SecurityContext *corev1.PodSecurityContextArgs - ServiceAccountName *sdk.StringOutput - Sidecars []corev1.ContainerArgs - SidecarOutputs []corev1.ContainerOutput - InitContainerOutputs []corev1.ContainerOutput - VolumeOutputs []corev1.VolumeOutput - SecretVolumeOutputs []any - ComputeContext pApi.ComputeContext - ImagePullSecret *docker.RegistryCredentials - UseSSL bool + RollingUpdate *v1.RollingUpdateDeploymentArgs + InitContainers []corev1.ContainerArgs + Containers []corev1.ContainerArgs + SecurityContext *corev1.PodSecurityContextArgs + ServiceAccountName *sdk.StringOutput + Sidecars []corev1.ContainerArgs + SidecarOutputs []corev1.ContainerOutput + InitContainerOutputs []corev1.ContainerOutput + VolumeOutputs []corev1.VolumeOutput + SecretVolumeOutputs []any + ComputeContext pApi.ComputeContext + ImagePullSecret *docker.RegistryCredentials + UseSSL bool EphemeralSize string TerminationGracePeriodSeconds *int } @@ -518,8 +518,8 @@ func NewSimpleContainer(ctx *sdk.Context, args *SimpleContainerArgs, opts ...sdk args.Log.Info(ctx.Context(), "🔍 DEBUG: Converted 
affinity result: %+v", convertedAffinity) podSpecArgs := &corev1.PodSpecArgs{ - NodeSelector: sdk.ToStringMap(args.NodeSelector), - Affinity: convertedAffinity, + NodeSelector: sdk.ToStringMap(args.NodeSelector), + Affinity: convertedAffinity, TerminationGracePeriodSeconds: func() sdk.IntPtrInput { if args.TerminationGracePeriodSeconds != nil { return sdk.IntPtr(*args.TerminationGracePeriodSeconds) From 716aadca258eb788b188b4fdc5c7b1287f488e58 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Fri, 10 Apr 2026 22:27:03 +0400 Subject: [PATCH 4/4] fix: use absolute path in ENTRYPOINT to survive GHA workdir override --- github-actions.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github-actions.Dockerfile b/github-actions.Dockerfile index e3375f98..3b869a8e 100644 --- a/github-actions.Dockerfile +++ b/github-actions.Dockerfile @@ -94,4 +94,4 @@ RUN pulumi version > /dev/null && \ gcloud components list --filter="name:gke-gcloud-auth-plugin" --format="value(name)" | grep -q gke-gcloud-auth-plugin # Set the entrypoint -ENTRYPOINT ["./github-actions"] +ENTRYPOINT ["/root/github-actions"]