Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions docs/docs/concepts/vertical-pod-autoscaler.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,30 @@ vpa:
controlledResources: ["cpu", "memory"] # Specify which resources to manage
```

### **Controlled Values**

By default, VPA rewrites both `requests` and `limits` at admission, scaling the limit proportionally with the request. Some workloads size their limits to absorb cold-start CPU bursts (Django/gunicorn, Node SSR, JVM warmup). For these, a low `minAllowed.cpu` combined with the default behaviour can shrink the CPU limit below what the cold-start path needs, causing startup-probe failures and SIGKILLs.

Set `controlledValues: "RequestsOnly"` to tell VPA to only rewrite `requests` and leave the deployment template's `limits` untouched. The deployment then keeps its full cold-start headroom while VPA still right-sizes the steady-state request.

```yaml
vpa:
enabled: true
updateMode: "Auto"
minAllowed:
cpu: "50m" # safe at this floor when limit is preserved
memory: "64Mi"
maxAllowed:
cpu: "2"
memory: "4Gi"
controlledResources: ["cpu", "memory"]
controlledValues: "RequestsOnly" # leave deployment-template limits alone
```

Valid values:
- `RequestsAndLimits` (default) — VPA scales both. Equivalent to omitting the field.
- `RequestsOnly` — VPA scales only `requests`; `limits` stay at the values in the underlying deployment template.

## **VPA Best Practices**

### **1. Environment-Specific Configuration**
Expand Down
100 changes: 97 additions & 3 deletions pkg/clouds/pulumi/create_stack.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package pulumi

import (
"context"
"strings"

"github.com/pkg/errors"

"github.com/pulumi/pulumi/pkg/v3/backend"
"gocloud.dev/gcerrors"

"github.com/simple-container-com/api/pkg/api"
)
Expand Down Expand Up @@ -34,9 +36,101 @@ func (p *pulumi) selectStack(ctx context.Context, cfg *api.ConfigFile, stack api
if err != nil {
return nil, err
}
if s, err := p.backend.GetStack(ctx, p.stackRef); err != nil {
s, err := p.backend.GetStack(ctx, p.stackRef)
if err != nil {
// Treat "checkpoint blob not found" as "stack does not exist".
//
// Pulumi's diy backend (pkg/v3/backend/diy) is supposed to map a
// missing checkpoint to (nil, nil) from GetStack — its own
// errCheckpointNotFound sentinel handles that. But the path runs
// through gocloud.dev/blob.Bucket.Exists, which only converts
// provider errors to (false, nil) when gcerrors.Code(err) ==
// gcerrors.NotFound.
//
// Recent transitive bumps to cloud.google.com/go/storage (and the
// equivalent S3/Azure clients) sometimes surface a 404 through an
// error path that gocloud no longer classifies as NotFound — the
// error reaches Exists as code=Unknown, Exists returns (false,
// wrapped-err) instead of (false, nil), stackExists wraps that
// as "failed to load checkpoint", and GetStack returns the wrap
// rather than the (nil, nil) "missing stack" contract that the
// rest of SC's createStackIfNotExists / selectStack callers
// depend on.
//
// This affected external SC consumers on 2026.5.31 (e.g. the
// wize-rooms-api deploy on 2026-05-21) with:
// failed to get parent stack "wize-rooms-api":
// failed to get stack "wize-rooms-api":
// failed to load checkpoint: blob (key ".pulumi/stacks/<proj>/<stack>.json")
// (code=Unknown): storage: object doesn't exist:
// googleapi: Error 404: No such object: ...
//
// Restore the v3.184-era contract here: if the underlying error
// is a NotFound (either by gocloud code or by the layered string
// pattern that surfaces from current GCS/S3 clients), treat
// GetStack as having returned (nil, nil) — the caller will then
// CreateStack as it did before the regression.
if stackCheckpointNotFound(err) {
return nil, nil
}
return s, errors.Wrapf(err, "failed to get stack %q", p.stackRef)
} else {
return s, nil
}
return s, nil
}

// stackCheckpointNotFound reports whether err, as returned by the diy
// backend's GetStack, means that the stack's checkpoint blob is absent from
// state storage, i.e. the stack has not been created yet.
//
// Two checks run in order:
//
//  1. Structured: gcerrors.Code(err) == gcerrors.NotFound.
//  2. Textual fallback: the message must contain Pulumi's
//     "failed to load checkpoint" wrapper text AND at least one known
//     provider "missing object" marker. Requiring the wrapper text keeps
//     NotFound-shaped errors that come from elsewhere from being swallowed.
//
// Marker comparison is done against the lower-cased message, so casing
// differences between SDK versions ("NotFound" vs "notFound") do not matter.
//
// NOTE(review): a message whose only hint is the spaced phrase "not found"
// (without "notfound", "404", etc.) is NOT matched by the current marker
// list — confirm whether that is intended.
//
// A nil err yields false.
func stackCheckpointNotFound(err error) bool {
	switch {
	case err == nil:
		return false
	case gcerrors.Code(err) == gcerrors.NotFound:
		// gocloud already classified the failure; no string sniffing needed.
		return true
	}

	text := err.Error()
	// The wrapper check is case-sensitive and runs on the raw message.
	if !strings.Contains(text, "failed to load checkpoint") {
		return false
	}

	// Lower-case substrings that providers put in "object missing" errors.
	// Several entries overlap ("blobnotfound" already contains "notfound");
	// they are kept to document which provider each marker was seen from.
	needles := [...]string{
		"object doesn't exist", // GCS
		"notfound",             // GCS "notFound", S3 v2 "api error NotFound"
		"nosuchkey",            // S3 v1
		"blobnotfound",         // Azure
		"resourcenotfound",     // Azure
		"statuscode: 404",      // S3 v2 HTTP status rendering
		"error 404",            // googleapi HTTP status rendering
	}

	lowered := strings.ToLower(text)
	for i := range needles {
		if strings.Contains(lowered, needles[i]) {
			return true
		}
	}
	return false
}
84 changes: 84 additions & 0 deletions pkg/clouds/pulumi/create_stack_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package pulumi

import (
"errors"
"fmt"
"testing"
)

// Note: we deliberately don't test the gcerrors.Code() == NotFound branch
// here. Constructing a `*gcerr.Error` requires gocloud.dev/internal/gcerr
// which is an internal package; the gcerrors.Code lookup uses errors.As on
// that concrete type. Functional coverage of that branch comes from gocloud
// itself; what's regressed and needs unit coverage is the string-fallback
// path that the customer's deploy actually hit.

// TestStackCheckpointNotFound is a table-driven test for the string-fallback
// path of stackCheckpointNotFound. Each case builds an error whose message
// mimics what a storage provider surfaces and checks whether it is classified
// as "checkpoint missing".
func TestStackCheckpointNotFound(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want bool
	}{
		{
			name: "nil",
			err:  nil,
			want: false,
		},
		{
			name: "GCS 404 wrapped through Pulumi diy backend (the customer regression case)",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/demo/wize-rooms-api.json") (code=Unknown): storage: object doesn't exist: googleapi: Error 404: No such object: likeclaw-simple-container-state/.pulumi/stacks/demo/wize-rooms-api.json, notFound`)),
			want: true,
		},
		{
			name: "GCS 404 without the 'failed to load checkpoint' prefix — out of scope, don't swallow",
			err:  errors.New(`storage: object doesn't exist: googleapi: Error 404`),
			want: false,
		},
		{
			name: "S3 v1 NoSuchKey wrapped through Pulumi diy backend",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): NoSuchKey: The specified key does not exist`)),
			want: true,
		},
		{
			name: "S3 v2 SDK 'api error NotFound' wrapped through Pulumi diy backend",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): operation error S3: HeadObject, https response error StatusCode: 404, RequestID: x, HostID: y, api error NotFound: Not Found`)),
			want: true,
		},
		{
			name: "Azure BlobNotFound wrapped through Pulumi diy backend",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): BlobNotFound`)),
			want: true,
		},
		{
			name: "GCS NotFound with capitalized 'Not Found' (case-insensitivity guard)",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): NotFound: object Not Found`)),
			want: true,
		},
		{
			name: "Generic 'StatusCode: 404' wrap (covers future client SDKs we don't enumerate)",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New(`blob (key ".pulumi/stacks/foo/bar.json") (code=Unknown): StatusCode: 404`)),
			want: true,
		},
		{
			name: "unrelated error containing 'failed to load checkpoint' but no NotFound marker",
			err: fmt.Errorf("failed to load checkpoint: %w",
				errors.New("permission denied: 403")),
			want: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := stackCheckpointNotFound(tc.err)
			if got != tc.want {
				// %v rather than %q: the table includes a nil error, which
				// %q would render as the fmt error marker "%!q(<nil>)".
				t.Errorf("stackCheckpointNotFound(%v) = %v, want %v", tc.err, got, tc.want)
			}
		})
	}
}
Loading