From fef17f13c4d30539a8a809c1e8fec52805bd0b12 Mon Sep 17 00:00:00 2001 From: Mauritz Uphoff Date: Tue, 23 Sep 2025 18:07:31 +0200 Subject: [PATCH] feat: return cluster error messages on wait handler --- CHANGELOG.md | 5 ++++ services/ske/CHANGELOG.md | 3 ++ services/ske/VERSION | 2 +- services/ske/wait/wait.go | 24 +++++++++++++++ services/ske/wait/wait_test.go | 53 +++++++++++++++++++++++++++++++++- 5 files changed, 85 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d54c74d22..d8118b462 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,11 @@ - Add field `Labels` (type `*map[string]string`) to structs `LoadBalancer`, `CreateLoadBalancerPayload`, `UpdateLoadBalancerPayload` - `git`: [v0.8.0](services/git/CHANGELOG.md#v080) - **Feature:** Add support for the instance patch operation +- `ske` + - [v1.4.1](services/ske/CHANGELOG.md#v141) + - **Feature:** Exit `CreateOrUpdateClusterWaitHandler` early when the cluster reports structured errors and is in a failure state. + + ## Release (2025-09-11) - `cdn`: [v1.5.0](services/cdn/CHANGELOG.md#v150) diff --git a/services/ske/CHANGELOG.md b/services/ske/CHANGELOG.md index 405609ce5..01620af85 100644 --- a/services/ske/CHANGELOG.md +++ b/services/ske/CHANGELOG.md @@ -1,3 +1,6 @@ +## v1.4.1 +- **Feature:** Exit `CreateOrUpdateClusterWaitHandler` early when the cluster reports structured errors and is in a failure state. 
+ ## v1.4.0 - **Feature:** Add new field `Kubernetes` to `Nodepool` model diff --git a/services/ske/VERSION b/services/ske/VERSION index 0d0c52f84..66d62a800 100644 --- a/services/ske/VERSION +++ b/services/ske/VERSION @@ -1 +1 @@ -v1.4.0 +v1.4.1 diff --git a/services/ske/wait/wait.go b/services/ske/wait/wait.go index e4745b09a..ec5d4330a 100644 --- a/services/ske/wait/wait.go +++ b/services/ske/wait/wait.go @@ -33,6 +33,8 @@ const ( CredentialsRotationStateCompleted = "COMPLETED" // Deprecated: InvalidArgusInstanceErrorCode is deprecated and will be removed after 14th November 2025. Use [ske.RUNTIMEERRORCODE_OBSERVABILITY_INSTANCE_NOT_FOUND] instead. InvalidArgusInstanceErrorCode = "SKE_ARGUS_INSTANCE_NOT_FOUND" + + ClusterNoValidHostFound = "SKE_NODE_NO_VALID_HOST_FOUND" ) type APIClientClusterInterface interface { @@ -42,6 +44,8 @@ type APIClientClusterInterface interface { // CreateOrUpdateClusterWaitHandler will wait for cluster creation or update func CreateOrUpdateClusterWaitHandler(ctx context.Context, a APIClientClusterInterface, projectId, region, name string) *wait.AsyncActionHandler[ske.Cluster] { + startTime := time.Now() + handler := wait.New(func() (waitFinished bool, response *ske.Cluster, err error) { s, err := a.GetClusterExecute(ctx, projectId, region, name) if err != nil { @@ -56,6 +60,26 @@ func CreateOrUpdateClusterWaitHandler(ctx context.Context, a APIClientClusterInt return true, s, nil } + // If cluster is UNSPECIFIED or UNHEALTHY and has structured errors, exit early + hasStructuredErrors := s.Status.Errors != nil && len(*s.Status.Errors) > 0 + if (state == ske.CLUSTERSTATUSSTATE_UNSPECIFIED || state == ske.CLUSTERSTATUSSTATE_UNHEALTHY) && hasStructuredErrors { + for _, clusterError := range *s.Status.Errors { + if clusterError.Code != nil && clusterError.Message != nil { + return true, s, nil + } + } + } + + // Waiter has been running more than 15 minutes and cluster is still in CREATING or RECONCILING state with errors + if 
time.Since(startTime) > 15*time.Minute && + (state == ske.CLUSTERSTATUSSTATE_CREATING || state == ske.CLUSTERSTATUSSTATE_RECONCILING) && hasStructuredErrors { + for _, clusterError := range *s.Status.Errors { + if clusterError.Code != nil && clusterError.Message != nil { + return true, s, nil + } + } + } + if state == ske.CLUSTERSTATUSSTATE_HEALTHY || state == ske.CLUSTERSTATUSSTATE_HIBERNATED { return true, s, nil } diff --git a/services/ske/wait/wait_test.go b/services/ske/wait/wait_test.go index 508718443..5eea52704 100644 --- a/services/ske/wait/wait_test.go +++ b/services/ske/wait/wait_test.go @@ -18,6 +18,7 @@ type apiClientClusterMocked struct { name string resourceState ske.ClusterStatusState invalidArgusInstance bool + errorList *[]ske.ClusterError } const testRegion = "eu01" @@ -45,7 +46,17 @@ func (a *apiClientClusterMocked) GetClusterExecute(_ context.Context, _, _, _ st return &ske.Cluster{ Name: utils.Ptr("cluster"), Status: &ske.ClusterStatus{ - Aggregated: &rs, + Aggregated: utils.Ptr(rs), + Error: func() *ske.RuntimeError { + if a.invalidArgusInstance { + return &ske.RuntimeError{ + Code: utils.Ptr(ske.RUNTIMEERRORCODE_OBSERVABILITY_INSTANCE_NOT_FOUND), + Message: utils.Ptr("invalid argus instance"), + } + } + return nil + }(), + Errors: a.errorList, }, }, nil } @@ -77,6 +88,7 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) { invalidArgusInstance bool wantErr bool wantResp bool + errorList *[]ske.ClusterError }{ { desc: "create_succeeded", @@ -120,6 +132,40 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) { wantErr: true, wantResp: false, }, + { + desc: "status_errors_present_state_unhealthy", + getFails: false, + resourceState: ske.CLUSTERSTATUSSTATE_UNHEALTHY, + errorList: &[]ske.ClusterError{ + { + Code: utils.Ptr("ERR_CODE"), + Message: utils.Ptr("Error 1"), + }, + { + Code: utils.Ptr("ERR_OTHER"), + Message: utils.Ptr("Error 2"), + }, + }, + wantErr: false, + wantResp: true, + }, + { + desc: 
"status_errors_present_state_unspecified", + getFails: false, + resourceState: ske.CLUSTERSTATUSSTATE_UNSPECIFIED, + errorList: &[]ske.ClusterError{ + { + Code: utils.Ptr("ERR_CODE"), + Message: utils.Ptr("Error 1"), + }, + { + Code: utils.Ptr("ERR_OTHER"), + Message: utils.Ptr("Error 2"), + }, + }, + wantErr: false, + wantResp: true, + }, } for _, tt := range tests { t.Run(tt.desc, func(t *testing.T) { @@ -130,6 +176,7 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) { name: name, resourceState: tt.resourceState, invalidArgusInstance: tt.invalidArgusInstance, + errorList: tt.errorList, } var wantRes *ske.Cluster rs := ske.ClusterStatusState(tt.resourceState) @@ -147,6 +194,10 @@ func TestCreateOrUpdateClusterWaitHandler(t *testing.T) { Message: utils.Ptr("invalid argus instance"), } } + + if tt.errorList != nil && len(*tt.errorList) > 0 { + wantRes.Status.Errors = tt.errorList + } } handler := CreateOrUpdateClusterWaitHandler(context.Background(), apiClient, "", testRegion, name)