Skip to content

Commit 7ad030a

Browse files
committed
fix: the etcd recovery client and tests
This is a follow-up fix to PR #5129. 1. Correctly catch only expected errors in the tests. 2. Rewind the snapshot each time the upload is retried. 3. Correctly unwrap errors in the `EtcdRecover` client. 4. Update the `grpc-proxy` library to pass through the EOF error. Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com> (cherry picked from commit f477507)
1 parent 4adae5e commit 7ad030a

File tree

4 files changed

+43
-11
lines changed

4 files changed

+43
-11
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ require (
9898
github.com/talos-systems/go-procfs v0.1.0
9999
github.com/talos-systems/go-retry v0.3.1
100100
github.com/talos-systems/go-smbios v0.1.1
101-
github.com/talos-systems/grpc-proxy v0.2.0
101+
github.com/talos-systems/grpc-proxy v0.3.0
102102
github.com/talos-systems/net v0.3.2
103103
github.com/talos-systems/siderolink v0.1.0
104104
github.com/talos-systems/talos/pkg/machinery v1.0.0-beta.2

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,8 +1255,8 @@ github.com/talos-systems/go-retry v0.3.1 h1:GjjyHB8i1CJpb1O5qYPMljq74cRQ5uiDoyMa
12551255
github.com/talos-systems/go-retry v0.3.1/go.mod h1:HiXQqyVStZ35uSY/MTLWVvQVmC3lIW2MS5VdDaMtoKM=
12561256
github.com/talos-systems/go-smbios v0.1.1 h1:Au6obB/Pp0i0JHhvPlzONk5aoNseosO2BUsmvWWi7y8=
12571257
github.com/talos-systems/go-smbios v0.1.1/go.mod h1:vk76naUSZaWE8Z95wbDn51FgH0goECM4oK3KY2hYSMU=
1258-
github.com/talos-systems/grpc-proxy v0.2.0 h1:DN75bLfaW4xfhq0r0mwFRnfGhSB+HPhK1LNzuMEs9Pw=
1259-
github.com/talos-systems/grpc-proxy v0.2.0/go.mod h1:sm97Vc/z2cok3pu6ruNeszQej4KDxFrDgfWs4C1mtC4=
1258+
github.com/talos-systems/grpc-proxy v0.3.0 h1:pM4P6GqzjmZzq6jAxeP2mzjuTbx9B2E5peVdi2bufM4=
1259+
github.com/talos-systems/grpc-proxy v0.3.0/go.mod h1:1TNaCLt0NTdFdz48OCnOM+HbPEIkj3DhCi85+z6zSM8=
12601260
github.com/talos-systems/net v0.3.2 h1:IMseRyuha8fNsv/3FbQPRE9hLVRBEFR+9sxcoETQ5vI=
12611261
github.com/talos-systems/net v0.3.2/go.mod h1:zhcGixNJz9dgwFiUwc7gkkAqdVqXagU1SNNoIVXYKGo=
12621262
github.com/talos-systems/siderolink v0.1.0 h1:7mkJ9EicQ8J9DHHkwiNYGoccCgFcEIFcmfcKRyI7Y+8=

internal/integration/api/etcd-recover.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"time"
1818

1919
"github.com/talos-systems/go-retry/retry"
20+
"google.golang.org/grpc/codes"
2021

2122
"github.com/talos-systems/talos/internal/integration/base"
2223
machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
@@ -156,7 +157,7 @@ func (suite *EtcdRecoverSuite) TestSnapshotRecover() {
156157

157158
suite.T().Logf("recovering etcd snapshot at node %q", recoverNode)
158159

159-
suite.Require().NoError(suite.recoverEtcd(recoverNode, &snapshot))
160+
suite.Require().NoError(suite.recoverEtcd(recoverNode, bytes.NewReader(snapshot.Bytes())))
160161

161162
suite.AssertClusterHealthy(suite.ctx)
162163

@@ -197,21 +198,40 @@ func (suite *EtcdRecoverSuite) snapshotEtcd(snapshotNode string, dest io.Writer)
197198
return err
198199
}
199200

200-
func (suite *EtcdRecoverSuite) recoverEtcd(recoverNode string, src io.Reader) error {
201+
func (suite *EtcdRecoverSuite) recoverEtcd(recoverNode string, src io.ReadSeeker) error {
201202
ctx := client.WithNodes(suite.ctx, recoverNode)
202203

204+
suite.T().Log("uploading the snapshot")
205+
203206
if err := retry.Constant(time.Minute, retry.WithUnits(time.Millisecond*200)).RetryWithContext(ctx, func(ctx context.Context) error {
204-
_, err := suite.Client.EtcdRecover(ctx, src)
207+
_, err := src.Seek(0, io.SeekStart)
208+
if err != nil {
209+
return err
210+
}
211+
212+
_, err = suite.Client.EtcdRecover(ctx, src)
213+
214+
if client.StatusCode(err) == codes.FailedPrecondition {
215+
return retry.ExpectedError(err)
216+
}
205217

206-
return retry.ExpectedError(err)
218+
return err
207219
}); err != nil {
208220
return fmt.Errorf("error uploading snapshot: %w", err)
209221
}
210222

223+
suite.T().Log("bootstrapping from the snapshot")
224+
211225
return retry.Constant(time.Minute, retry.WithUnits(time.Millisecond*200)).RetryWithContext(ctx, func(ctx context.Context) error {
212-
return retry.ExpectedError(suite.Client.Bootstrap(ctx, &machineapi.BootstrapRequest{
226+
err := suite.Client.Bootstrap(ctx, &machineapi.BootstrapRequest{
213227
RecoverEtcd: true,
214-
}))
228+
})
229+
230+
if client.StatusCode(err) == codes.FailedPrecondition || client.StatusCode(err) == codes.DeadlineExceeded {
231+
return retry.ExpectedError(err)
232+
}
233+
234+
return err
215235
})
216236
}
217237

pkg/machinery/client/client.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -907,7 +907,9 @@ func (c *Client) EtcdRecover(ctx context.Context, snapshot io.Reader, callOption
907907
default:
908908
}
909909

910-
n, err := snapshot.Read(buf)
910+
var n int
911+
912+
n, err = snapshot.Read(buf)
911913
if err != nil {
912914
if errors.Is(err, io.EOF) {
913915
break
@@ -919,11 +921,21 @@ func (c *Client) EtcdRecover(ctx context.Context, snapshot io.Reader, callOption
919921
if err = cli.Send(&common.Data{
920922
Bytes: buf[:n],
921923
}); err != nil {
924+
if errors.Is(err, io.EOF) {
925+
break
926+
}
927+
922928
return nil, err
923929
}
924930
}
925931

926-
return cli.CloseAndRecv()
932+
resp, err := cli.CloseAndRecv()
933+
934+
var filtered interface{}
935+
filtered, err = FilterMessages(resp, err)
936+
resp, _ = filtered.(*machineapi.EtcdRecoverResponse) //nolint:errcheck
937+
938+
return resp, err
927939
}
928940

929941
// GenerateClientConfiguration implements proto.MachineServiceClient interface.

0 commit comments

Comments
 (0)