Skip to content

Commit

Permalink
Merge pull request #1312 from zimnx/mz/replace-node-uuid
Browse files Browse the repository at this point in the history
Replace ScyllaCluster nodes using Host ID
  • Loading branch information
zimnx committed Aug 17, 2023
2 parents fac3da6 + 98f603f commit 7157fce
Show file tree
Hide file tree
Showing 374 changed files with 51,564 additions and 189 deletions.
2 changes: 1 addition & 1 deletion examples/common/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2965,7 +2965,7 @@ spec:
replace_address_first_boot:
additionalProperties:
type: string
description: replace_address_first_boot holds addresses which should be replaced by new nodes.
description: 'replace_address_first_boot holds addresses which should be replaced by new nodes. DEPRECATED: since Scylla Operator 1.10 it''s only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1). With Scylla Operator 1.11+ this field may be empty.'
type: object
stale:
description: stale indicates if the current rack status is collected for a previous generation. stale should eventually become false when the appropriate controller writes a fresh status.
Expand Down
2 changes: 1 addition & 1 deletion pkg/api/scylla/v1/scylla.scylladb.com_scyllaclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2009,7 +2009,7 @@ spec:
replace_address_first_boot:
additionalProperties:
type: string
description: replace_address_first_boot holds addresses which should be replaced by new nodes.
description: 'replace_address_first_boot holds addresses which should be replaced by new nodes. DEPRECATED: since Scylla Operator 1.10 it''s only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1). With Scylla Operator 1.11+ this field may be empty.'
type: object
stale:
description: stale indicates if the current rack status is collected for a previous generation. stale should eventually become false when the appropriate controller writes a fresh status.
Expand Down
5 changes: 2 additions & 3 deletions pkg/api/scylla/v1/types_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,9 @@ type RackStatus struct {
// conditions are the latest available observations of a rack's state.
Conditions []RackCondition `json:"conditions,omitempty"`

// FIXME: The json value should have been a camelCase string.
// We need to deprecate this value and introduce a new one.

// replace_address_first_boot holds addresses which should be replaced by new nodes.
// DEPRECATED: since Scylla Operator 1.10 it's only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1).
// With Scylla Operator 1.11+ this field may be empty.
ReplaceAddressFirstBoot map[string]string `json:"replace_address_first_boot,omitempty"`
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/tests/tests_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,15 @@ var suites = ginkgotest.TestSuites{
Description: templates.LongDesc(`
Runs all tests.
`),
DefaultParallelism: 15,
DefaultParallelism: 8,
},
{
Name: "scylla-operator/conformance/parallel",
Description: templates.LongDesc(`
Tests that ensure an Scylla Operator is working properly.
`),
LabelFilter: fmt.Sprintf("!%s", framework.SerialLabelName),
DefaultParallelism: 15,
DefaultParallelism: 8,
},
{
Name: "scylla-operator/conformance/serial",
Expand Down
566 changes: 392 additions & 174 deletions pkg/controller/scyllacluster/sync_services.go

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pkg/naming/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ const (
// ReplaceLabel express the intent to replace pod under the specific member.
ReplaceLabel = "scylla/replace"

// ReplacingNodeHostIDLabel contains the Host ID of the node that the labelled node is replacing.
ReplacingNodeHostIDLabel = "internal.scylla-operator.scylladb.com/replacing-node-hostid"

// NodeMaintenanceLabel means that node is under maintenance.
// Readiness check will always fail when this label is added to member service.
NodeMaintenanceLabel = "scylla/node-maintenance"
Expand Down
73 changes: 73 additions & 0 deletions pkg/scyllaclient/config_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Copyright (c) 2023 ScyllaDB.

package scyllaclient

import (
"context"
"fmt"
"net"
"net/http"
"time"

api "github.com/go-openapi/runtime/client"
"github.com/go-openapi/strfmt"
"github.com/scylladb/scylla-operator/pkg/auth"
scyllaclient "github.com/scylladb/scylladb-swagger-go-client/scylladb/gen/v2/client"
"github.com/scylladb/scylladb-swagger-go-client/scylladb/gen/v2/client/config"
)

const (
// agentPort is the port joined with the target host to build the API address.
agentPort = "10001"
// defaultTimeout is the timeout set on the HTTP client used for API requests.
defaultTimeout = 30 * time.Second
)

// ConfigClient reads ScyllaDB configuration parameters through the generated ScyllaDB v2 API client.
type ConfigClient struct {
// client is the generated ScyllaDB v2 API client used to issue config requests.
client *scyllaclient.ScylladbV2
}

// NewConfigClient builds a ConfigClient that talks to the given host on agentPort.
// Requests go through the default transport wrapped with fixContentType and
// auth.AddToken (using authToken), and are bounded by defaultTimeout.
func NewConfigClient(host, authToken string) *ConfigClient {
	// Wrap order matters: default transport first, then the content type fix, then the token.
	roundTripper := auth.AddToken(fixContentType(DefaultTransport()), authToken)

	httpClient := &http.Client{
		Transport: roundTripper,
		Timeout:   defaultTimeout,
	}

	v2Runtime := api.NewWithClient(
		net.JoinHostPort(host, agentPort),
		scyllaclient.DefaultBasePath,
		scyllaclient.DefaultSchemes,
		httpClient,
	)

	return &ConfigClient{client: scyllaclient.New(v2Runtime, strfmt.Default)}
}

// BroadcastAddress returns value of "broadcast_address" config parameter.
func (c *ConfigClient) BroadcastAddress(ctx context.Context) (string, error) {
	params := config.NewFindConfigBroadcastAddressParamsWithContext(ctx)

	result, err := c.client.Config.FindConfigBroadcastAddress(params)
	if err != nil {
		return "", fmt.Errorf("can't get broadcast_address: %w", err)
	}

	return result.Payload, nil
}

// ReplaceAddressFirstBoot fetches the "replace_address_first_boot" config parameter.
// On failure it returns an empty string and the wrapped request error.
func (c *ConfigClient) ReplaceAddressFirstBoot(ctx context.Context) (string, error) {
	params := config.NewFindConfigReplaceAddressFirstBootParamsWithContext(ctx)

	result, err := c.client.Config.FindConfigReplaceAddressFirstBoot(params)
	if err != nil {
		return "", fmt.Errorf("can't get replace_address_first_boot: %w", err)
	}

	return result.Payload, nil
}

// ReplaceNodeFirstBoot fetches the "replace_node_first_boot" config parameter.
// On failure it returns an empty string and the wrapped request error.
func (c *ConfigClient) ReplaceNodeFirstBoot(ctx context.Context) (string, error) {
	params := config.NewFindConfigReplaceNodeFirstBootParamsWithContext(ctx)

	result, err := c.client.Config.FindConfigReplaceNodeFirstBoot(params)
	if err != nil {
		return "", fmt.Errorf("can't get replace_node_first_boot: %w", err)
	}

	return result.Payload, nil
}
62 changes: 62 additions & 0 deletions pkg/scyllafeatures/scyllafeatures.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) 2023 ScyllaDB.

package scyllafeatures

import (
"fmt"

"github.com/blang/semver"
scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1"
)

var (
// scyllaEnterpriseMinimalVersion is the threshold used to tell editions apart:
// versions at or above it are treated as Enterprise, versions below it as Open Source.
scyllaEnterpriseMinimalVersion = semver.MustParse("2000.0.0")
)

// ScyllaFeature names a ScyllaDB feature whose availability depends on the ScyllaDB version.
type ScyllaFeature string

const (
// ReplacingNodeUsingHostID identifies the ability to replace a node using its Host ID.
ReplacingNodeUsingHostID ScyllaFeature = "ReplacingNodeUsingHostID"
)

// scyllaDBVersionMinimalConstraint holds the minimal ScyllaDB version required per edition.
type scyllaDBVersionMinimalConstraint struct {
// openSource is the minimal required Open Source version.
openSource semver.Version
// enterprise is the minimal required Enterprise version.
enterprise semver.Version
}

// featureMinimalVersionConstraints maps every known feature to the minimal
// Open Source and Enterprise versions that provide it.
var featureMinimalVersionConstraints = map[ScyllaFeature]scyllaDBVersionMinimalConstraint{
ReplacingNodeUsingHostID: {
openSource: semver.MustParse("5.2.0"),
enterprise: semver.MustParse("2023.1.0"),
},
}

// Supports reports whether the ScyllaDB version declared in sc.Spec.Version
// satisfies the minimal version constraint registered for feature.
//
// It returns an error when feature has no registered constraints or when
// sc.Spec.Version cannot be parsed as a semantic version.
func Supports(sc *scyllav1.ScyllaCluster, feature ScyllaFeature) (bool, error) {
	minimal, known := featureMinimalVersionConstraints[feature]
	if !known {
		return false, fmt.Errorf("unable to find minimal version constraints, unknown feature %q", feature)
	}

	parsed, err := semver.Parse(sc.Spec.Version)
	if err != nil {
		return false, fmt.Errorf("can't parse ScyllaCluster version %q: %w", sc.Spec.Version, err)
	}

	// A version belongs to exactly one edition, so only that edition's constraint is compared.
	switch {
	case isOpenSource(parsed):
		return parsed.GTE(minimal.openSource), nil
	case isEnterprise(parsed):
		return parsed.GTE(minimal.enterprise), nil
	default:
		return false, nil
	}
}

// isEnterprise reports whether v is not lower than scyllaEnterpriseMinimalVersion.
func isEnterprise(v semver.Version) bool {
	belowThreshold := v.LT(scyllaEnterpriseMinimalVersion)
	return !belowThreshold
}

// isOpenSource reports whether v is lower than scyllaEnterpriseMinimalVersion.
func isOpenSource(v semver.Version) bool {
	atOrAboveThreshold := v.GTE(scyllaEnterpriseMinimalVersion)
	return !atOrAboveThreshold
}
14 changes: 13 additions & 1 deletion pkg/sidecar/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,20 @@ func (s *ScyllaConfig) setupEntrypoint(ctx context.Context) (*exec.Cmd, error) {
}
// If node is being replaced
if addr, ok := m.ServiceLabels[naming.ReplaceLabel]; ok {
args["replace-address-first-boot"] = pointer.StringPtr(addr)
if len(addr) == 0 {
klog.Warningf("Service %q have unexpectedly empty label %q, skipping replace", m.Name, naming.ReplaceLabel)
} else {
args["replace-address-first-boot"] = pointer.StringPtr(addr)
}
}
if hostID, ok := m.ServiceLabels[naming.ReplacingNodeHostIDLabel]; ok {
if len(hostID) == 0 {
klog.Warningf("Service %q have unexpectedly empty label %q, skipping replace", m.Name, naming.ReplacingNodeHostIDLabel)
} else {
args["replace-node-first-boot"] = pointer.String(hostID)
}
}

// See if we need to use cpu-pinning
// TODO: Add more checks to make sure this is valid.
// eg. parse the cpuset and check the number of cpus is the same as cpu limits
Expand Down
98 changes: 91 additions & 7 deletions test/e2e/set/scyllacluster/scyllacluster_replace.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
"github.com/scylladb/scylla-operator/pkg/naming"
"github.com/scylladb/scylla-operator/pkg/scyllaclient"
scyllafixture "github.com/scylladb/scylla-operator/test/e2e/fixture/scylla"
"github.com/scylladb/scylla-operator/test/e2e/framework"
"github.com/scylladb/scylla-operator/test/e2e/utils"
Expand All @@ -18,17 +19,61 @@ import (
"k8s.io/apimachinery/pkg/types"
)

var _ = g.Describe("ScyllaCluster replace", func() {
var _ = g.Describe("ScyllaCluster", func() {
defer g.GinkgoRecover()

f := framework.NewFramework("scyllacluster")

g.It("should replace a node", func() {
const (
scyllaOSImageRepository = "docker.io/scylladb/scylla"
scyllaEnterpriseImageRepository = "docker.io/scylladb/scylla-enterprise"
)

validateReplaceViaClusterIPAddress := func(ctx context.Context, configClient *scyllaclient.ConfigClient, preReplaceService *corev1.Service) error {
replaceAddressFirstBoot, err := configClient.ReplaceAddressFirstBoot(ctx)
if err != nil {
return fmt.Errorf("can't get replace_address_first_boot config parameter: %w", err)
}

if replaceAddressFirstBoot != preReplaceService.Spec.ClusterIP {
return fmt.Errorf("unexpected value of replace_address_first_boot scylla config, expected %q, got %q", preReplaceService.Spec.ClusterIP, replaceAddressFirstBoot)
}

return nil
}

validateReplaceViaHostID := func(ctx context.Context, configClient *scyllaclient.ConfigClient, preReplaceService *corev1.Service) error {
replaceNodeFirstBoot, err := configClient.ReplaceNodeFirstBoot(ctx)
if err != nil {
return fmt.Errorf("can't get replace_node_first_boot config parameter: %w", err)
}

if replaceNodeFirstBoot != preReplaceService.Annotations[naming.HostIDAnnotation] {
return fmt.Errorf("unexpected value of replace_node_first_boot scylla config, expected %q, got %q", preReplaceService.Annotations[naming.HostIDAnnotation], replaceNodeFirstBoot)
}

return nil
}

type entry struct {
procedure string
scyllaImageRepository string
scyllaVersion string
validateScyllaConfig func(context.Context, *scyllaclient.ConfigClient, *corev1.Service) error
}

describeEntry := func(e *entry) string {
return fmt.Sprintf(`using %s based procedure when version of ScyllaDB is "%s:%s"`, e.procedure, e.scyllaImageRepository, e.scyllaVersion)
}

g.DescribeTable("should replace a node", func(e *entry) {
ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
defer cancel()

sc := scyllafixture.BasicScyllaCluster.ReadOrFail()
sc.Spec.Datacenter.Racks[0].Members = 2
sc.Spec.Repository = e.scyllaImageRepository
sc.Spec.Version = e.scyllaVersion
sc.Spec.Datacenter.Racks[0].Members = 3

framework.By("Creating a ScyllaCluster")
sc, err := f.ScyllaClient().ScyllaV1().ScyllaClusters(f.Namespace()).Create(ctx, sc, metav1.CreateOptions{})
Expand All @@ -42,10 +87,15 @@ var _ = g.Describe("ScyllaCluster replace", func() {

verifyScyllaCluster(ctx, f.KubeClient(), sc)
hosts := getScyllaHostsAndWaitForFullQuorum(ctx, f.KubeClient().CoreV1(), sc)
o.Expect(hosts).To(o.HaveLen(2))
o.Expect(hosts).To(o.HaveLen(int(utils.GetMemberCount(sc))))
di := insertAndVerifyCQLData(ctx, hosts)
defer di.Close()

replacedNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())

preReplaceService := replacedNodeService.DeepCopy()

framework.By("Replacing a node #0")
pod, err := f.KubeClient().CoreV1().Pods(f.Namespace()).Get(
ctx,
Expand Down Expand Up @@ -91,7 +141,7 @@ var _ = g.Describe("ScyllaCluster replace", func() {
client, _, err := utils.GetScyllaClient(ctx, f.KubeClient().CoreV1(), sc)
o.Expect(err).NotTo(o.HaveOccurred())

replacedNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
replacedNodeService, err = f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())

otherNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 1), metav1.GetOptions{})
Expand All @@ -106,9 +156,43 @@ var _ = g.Describe("ScyllaCluster replace", func() {
oldHosts := hosts
hosts = getScyllaHostsAndWaitForFullQuorum(ctx, f.KubeClient().CoreV1(), sc)
o.Expect(hosts).To(o.HaveLen(len(oldHosts)))
o.Expect(hosts).NotTo(o.ConsistOf(oldHosts))
err = di.SetClientEndpoints(hosts)
o.Expect(err).NotTo(o.HaveOccurred())
verifyCQLData(ctx, di)
})

framework.By("Verifying ScyllaDB config")

configClient, err := utils.GetScyllaConfigClient(ctx, f.KubeClient().CoreV1(), sc, replacedNodeService.Spec.ClusterIP)
o.Expect(err).NotTo(o.HaveOccurred())

err = e.validateScyllaConfig(ctx, configClient, preReplaceService)
o.Expect(err).NotTo(o.HaveOccurred())
},
g.Entry(describeEntry, &entry{
procedure: "ClusterIP",
scyllaImageRepository: scyllaOSImageRepository,
scyllaVersion: "5.1.15",
validateScyllaConfig: validateReplaceViaClusterIPAddress,
}),
g.Entry(describeEntry, &entry{
procedure: "ClusterIP",
scyllaImageRepository: scyllaEnterpriseImageRepository,
scyllaVersion: "2022.2.12",
validateScyllaConfig: validateReplaceViaClusterIPAddress,
}),
g.Entry(describeEntry, &entry{
procedure: "HostID",
scyllaImageRepository: scyllaOSImageRepository,
scyllaVersion: "5.2.0",
validateScyllaConfig: validateReplaceViaHostID,
}),
// TODO: Enable test when ScyllaDB Enterprise 2023.1 is released
// Ref: https://github.com/scylladb/scylla-operator/issues/1325
g.PEntry(describeEntry, &entry{
procedure: "HostID",
scyllaImageRepository: scyllaEnterpriseImageRepository,
scyllaVersion: "2023.1.0",
validateScyllaConfig: validateReplaceViaHostID,
}),
)
})
15 changes: 15 additions & 0 deletions test/e2e/utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,21 @@ func GetScyllaClient(ctx context.Context, client corev1client.CoreV1Interface, s
return scyllaClient, hosts, nil
}

// GetScyllaConfigClient reads the agent auth token Secret of the given ScyllaCluster
// and returns a ConfigClient for host that uses the token extracted from it.
func GetScyllaConfigClient(ctx context.Context, client corev1client.CoreV1Interface, sc *scyllav1.ScyllaCluster, host string) (*scyllaclient.ConfigClient, error) {
	secret, err := client.Secrets(sc.Namespace).Get(ctx, naming.AgentAuthTokenSecretName(sc.Name), metav1.GetOptions{})
	if err != nil {
		return nil, fmt.Errorf("can't get Secret %q: %w", naming.ManualRef(sc.Namespace, naming.AgentAuthTokenSecretName(sc.Name)), err)
	}

	token, err := helpers.GetAgentAuthTokenFromSecret(secret)
	if err != nil {
		return nil, fmt.Errorf("can't get auth token: %w", err)
	}

	return scyllaclient.NewConfigClient(host, token), nil
}

func GetHostsAndUUIDs(ctx context.Context, client corev1client.CoreV1Interface, sc *scyllav1.ScyllaCluster) ([]string, []string, error) {
serviceList, err := client.Services(sc.Namespace).List(ctx, metav1.ListOptions{
LabelSelector: GetMemberServiceSelector(sc.Name).String(),
Expand Down

0 comments on commit 7157fce

Please sign in to comment.