diff --git a/docs/docs/concepts/vertical-pod-autoscaler.md b/docs/docs/concepts/vertical-pod-autoscaler.md index af934015..9d6a5713 100644 --- a/docs/docs/concepts/vertical-pod-autoscaler.md +++ b/docs/docs/concepts/vertical-pod-autoscaler.md @@ -144,6 +144,30 @@ vpa: controlledResources: ["cpu", "memory"] # Specify which resources to manage ``` +### **Controlled Values** + +By default VPA rewrites both `requests` and `limits` at admission, scaling the limit proportionally with the request. For workloads whose limits are sized to absorb cold-start CPU bursts (Django/gunicorn, Node SSR, JVM warmup), a low `minAllowed.cpu` paired with the default behaviour can shrink the CPU limit below what the cold-start path needs, causing startup-probe failures and SIGKILLs. + +Set `controlledValues: "RequestsOnly"` to tell VPA to only rewrite `requests` and leave the deployment template's `limits` untouched. The deployment then keeps its full cold-start headroom while VPA still right-sizes the steady-state request. + +```yaml +vpa: + enabled: true + updateMode: "Auto" + minAllowed: + cpu: "50m" # safe at this floor when limit is preserved + memory: "64Mi" + maxAllowed: + cpu: "2" + memory: "4Gi" + controlledResources: ["cpu", "memory"] + controlledValues: "RequestsOnly" # leave deployment-template limits alone +``` + +Valid values: +- `RequestsAndLimits` (default) — VPA scales both. Equivalent to omitting the field. +- `RequestsOnly` — VPA scales only `requests`; `limits` stay at the values in the underlying deployment template. + ## **VPA Best Practices** ### **1. Environment-Specific Configuration** diff --git a/pkg/clouds/pulumi/create_stack.go b/pkg/clouds/pulumi/create_stack.go index 43f8d0a9..f6895f40 100644 --- a/pkg/clouds/pulumi/create_stack.go +++ b/pkg/clouds/pulumi/create_stack.go @@ -2,10 +2,12 @@ package pulumi import ( "context" + "strings" "github.com/pkg/errors" "github.com/pulumi/pulumi/pkg/v3/backend" + "gocloud.dev/gcerrors" "github.com/simple-container-com/api/pkg/api" ) @@ -34,9 +36,101 @@ func (p *pulumi) selectStack(ctx context.Context, cfg *api.ConfigFile, stack api if err != nil { return nil, err } - if s, err := p.backend.GetStack(ctx, p.stackRef); err != nil { + s, err := p.backend.GetStack(ctx, p.stackRef) + if err != nil { + // Treat "checkpoint blob not found" as "stack does not exist". + // + // Pulumi's diy backend (pkg/v3/backend/diy) is supposed to map a + // missing checkpoint to (nil, nil) from GetStack — its own + // errCheckpointNotFound sentinel handles that. But the path runs + // through gocloud.dev/blob.Bucket.Exists, which only converts + // provider errors to (false, nil) when gcerrors.Code(err) == + // gcerrors.NotFound. + // + // Recent transitive bumps to cloud.google.com/go/storage (and the + // equivalent S3/Azure clients) sometimes surface a 404 through an + // error path that gocloud no longer classifies as NotFound — the + // error reaches Exists as code=Unknown, Exists returns (false, + // wrapped-err) instead of (false, nil), stackExists wraps that + // as "failed to load checkpoint", and GetStack returns the wrap + // rather than the (nil, nil) "missing stack" contract that the + // rest of SC's createStackIfNotExists / selectStack callers + // depend on. + // + // This affected external SC consumers on 2026.5.31 (e.g. the + // wize-rooms-api deploy on 2026-05-21) with: + // failed to get parent stack "wize-rooms-api": + // failed to get stack "wize-rooms-api": + // failed to load checkpoint: blob (key ".pulumi/stacks//.json") + // (code=Unknown): storage: object doesn't exist: + // googleapi: Error 404: No such object: ... + // + // Restore the v3.184-era contract here: if the underlying error + // is a NotFound (either by gocloud code or by the layered string + // pattern that surfaces from current GCS/S3 clients), treat + // GetStack as having returned (nil, nil) — the caller will then + // CreateStack as it did before the regression. + if stackCheckpointNotFound(err) { + return nil, nil + } return s, errors.Wrapf(err, "failed to get stack %q", p.stackRef) - } else { - return s, nil } + return s, nil +} + +// stackCheckpointNotFound returns true when err coming back from the diy +// backend's GetStack indicates that the underlying checkpoint blob is +// missing — i.e. the stack does not yet exist in state storage. +// +// First check is the structured one: gocloud's gcerrors.Code. That's what +// blob.Bucket.Exists uses internally to convert to (false, nil), and when +// it works we never hit this function in the first place — the structured +// path is the happy case we're patching around. +// +// Second check is a string match on the wrapped error message. We use +// it only as a fallback for the case where the underlying provider client +// (GCS / S3 / Azure) wraps the 404 in a way that gcerrors no longer sees +// as NotFound. We deliberately scope the match to error chains that +// originated in Pulumi's "failed to load checkpoint:" wrapper so we don't +// accidentally swallow unrelated NotFound-shaped errors from elsewhere +// in the deploy program. +func stackCheckpointNotFound(err error) bool { + if err == nil { + return false + } + if gcerrors.Code(err) == gcerrors.NotFound { + return true + } + msg := err.Error() + if !strings.Contains(msg, "failed to load checkpoint") { + return false + } + // Provider-specific 404 markers that gcerrors.Code may miss after a + // transitive bump. Match case-insensitively to defend against + // formatting drift across client versions ("NotFound" vs "notFound", + // "Not Found" with space, etc.): + // - GCS: "object doesn't exist" / "notFound" / "Error 404" + // - S3 v1: "NoSuchKey" + // - S3 v2: "api error NotFound" / "StatusCode: 404" + // - Azure: "BlobNotFound" / "ResourceNotFound" + // + // The "404" suffix is intentional and load-bearing: it's the HTTP + // status code that virtually every cloud-storage provider includes + // in the wrapped error for a missing object, regardless of the + // SDK's NotFound enum naming. + msgLower := strings.ToLower(msg) + for _, marker := range []string{ + "object doesn't exist", + "notfound", + "nosuchkey", + "blobnotfound", + "resourcenotfound", + "statuscode: 404", + "error 404", + } { + if strings.Contains(msgLower, marker) { + return true + } + } + return false } diff --git a/pkg/clouds/pulumi/create_stack_test.go b/pkg/clouds/pulumi/create_stack_test.go new file mode 100644 index 00000000..b0d884e2 --- /dev/null +++ b/pkg/clouds/pulumi/create_stack_test.go @@ -0,0 +1,84 @@ +package pulumi + +import ( + "errors" + "fmt" + "testing" +) + +// Note: we deliberately don't test the gcerrors.Code() == NotFound branch +// here. Constructing a `*gcerr.Error` requires gocloud.dev/internal/gcerr +// which is an internal package; the gcerrors.Code lookup uses errors.As on +// that concrete type. Functional coverage of that branch comes from gocloud +// itself; what's regressed and needs unit coverage is the string-fallback +// path that the customer's deploy actually hit. + +func TestStackCheckpointNotFound(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "nil", + err: nil, + want: false, + }, + { + name: "GCS 404 wrapped through Pulumi diy backend (the customer regression case)", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/demo/wize-rooms-api.json") (code=Unknown): storage: object doesn't exist: googleapi: Error 404: No such object: likeclaw-simple-container-state/.pulumi/stacks/demo/wize-rooms-api.json, notFound`)), + want: true, + }, + { + name: "GCS 404 without the 'failed to load checkpoint' prefix — out of scope, don't swallow", + err: errors.New(`storage: object doesn't exist: googleapi: Error 404`), + want: false, + }, + { + name: "S3 v1 NoSuchKey wrapped through Pulumi diy backend", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): NoSuchKey: The specified key does not exist`)), + want: true, + }, + { + name: "S3 v2 SDK 'api error NotFound' wrapped through Pulumi diy backend", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): operation error S3: HeadObject, https response error StatusCode: 404, RequestID: x, HostID: y, api error NotFound: Not Found`)), + want: true, + }, + { + name: "Azure BlobNotFound wrapped through Pulumi diy backend", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): BlobNotFound`)), + want: true, + }, + { + name: "GCS NotFound with capitalized 'Not Found' (case-insensitivity guard)", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): NotFound: object Not Found`)), + want: true, + }, + { + name: "Generic 'StatusCode: 404' wrap (covers future client SDKs we don't enumerate)", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): StatusCode: 404`)), + want: true, + }, + { + name: "unrelated error containing 'failed to load checkpoint' but no NotFound marker", + err: fmt.Errorf("failed to load checkpoint: %w", + errors.New("permission denied: 403")), + want: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := stackCheckpointNotFound(tc.err) + if got != tc.want { + t.Errorf("stackCheckpointNotFound(%q) = %v, want %v", tc.err, got, tc.want) + } + }) + } +}