Skip to content

Commit

Permalink
manager: synchronize task fields with Manager 2.2 (#303)
Browse files Browse the repository at this point in the history
Intensity in Manager is float64, but k8s runtime controller doesn't support floats.
To workaround it, intensity becomes a string value, and validating webhook checks if
provided value is a correct float string.

Fixes #303
  • Loading branch information
zimnx committed Dec 17, 2020
1 parent 5fd3b4c commit 043eb7a
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 33 deletions.
20 changes: 12 additions & 8 deletions config/operator/crd/bases/scylla.scylladb.com_scyllaclusters.yaml
Expand Up @@ -1421,10 +1421,12 @@ spec:
failFast:
description: FailFast stop repair on first error.
type: boolean
host:
description: Host to repair, by default all hosts are repaired
type: string
intensity:
description: Intensity integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load. 0 value means repair at maximum intensity.
format: int64
type: integer
description: Intensity how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1. If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs). Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs. Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume. For Scylla clusters that DO NOT SUPPORT ROW-LEVEL REPAIR, intensity can be a decimal between (0,1). In that case it specifies percent of shards that can be repaired in parallel on a repair master node. For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
type: string
interval:
description: Interval task schedule interval e.g. 3d2h10m, valid units are d, h, m, s (default "0").
type: string
Expand All @@ -1441,7 +1443,7 @@ spec:
format: int64
type: integer
parallel:
description: Parallel The maximum number of repair jobs to run in parallel, each node can participate in at most one repair at any given time. Default is means system will repair at maximum parallelism.
description: 'Parallel the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas). Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used. The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes. The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.'
format: int64
type: integer
smallTableThreshold:
Expand Down Expand Up @@ -1591,12 +1593,14 @@ spec:
failFast:
description: FailFast stop repair on first error.
type: boolean
host:
description: Host to repair, by default all hosts are repaired
type: string
id:
type: string
intensity:
description: Intensity integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load. 0 value means repair at maximum intensity.
format: int64
type: integer
description: Intensity how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1. If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs). Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs. Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume. For Scylla clusters that DO NOT SUPPORT ROW-LEVEL REPAIR, intensity can be a decimal between (0,1). In that case it specifies percent of shards that can be repaired in parallel on a repair master node. For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
type: string
interval:
description: Interval task schedule interval e.g. 3d2h10m, valid units are d, h, m, s (default "0").
type: string
Expand All @@ -1613,7 +1617,7 @@ spec:
format: int64
type: integer
parallel:
description: Parallel The maximum number of repair jobs to run in parallel, each node can participate in at most one repair at any given time. Default is means system will repair at maximum parallelism.
description: 'Parallel the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas). Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used. The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes. The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.'
format: int64
type: integer
smallTableThreshold:
Expand Down
15 changes: 11 additions & 4 deletions docs/source/scylla_cluster_crd.md
Expand Up @@ -92,10 +92,17 @@ valid units are d, h, m, s (default "now").
* `numRetries` - the number of times a scheduled task will retry to run before failing (default 3).
* `dc` - list of datacenter glob patterns, e.g. `["dc1", "!otherdc*"]` used to specify the DCs to include or exclude from backup.
* `failFast` - stop repair on first error.
* `intensity` - integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load.
0 value means repair at maximum intensity.
* `parallel` - The maximum number of repair jobs to run in parallel, each node can participate in at most one repair
at any given time. Default is means system will repair at maximum parallelism.
* `intensity` - specifies how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1.
If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs).
Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs.
Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume.
For Scylla clusters that **do not support row-level repair**, intensity can be a decimal between (0,1).
In that case it specifies percent of shards that can be repaired in parallel on a repair master node.
For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
* `parallel` - specifies the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas).
Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used.
The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes.
The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.
* `keyspace` - a list of keyspace/tables glob patterns, e.g. `["keyspace", "!keyspace.table_prefix_*"]`
used to include or exclude keyspaces from repair.
* `smallTableThreshold` - enable small table optimization for tables of size lower than given threshold.
Expand Down
20 changes: 12 additions & 8 deletions examples/common/operator.yaml
Expand Up @@ -1436,10 +1436,12 @@ spec:
failFast:
description: FailFast stop repair on first error.
type: boolean
host:
description: Host to repair, by default all hosts are repaired
type: string
intensity:
description: Intensity integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load. 0 value means repair at maximum intensity.
format: int64
type: integer
description: Intensity how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1. If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs). Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs. Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume. For Scylla clusters that DO NOT SUPPORT ROW-LEVEL REPAIR, intensity can be a decimal between (0,1). In that case it specifies percent of shards that can be repaired in parallel on a repair master node. For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
type: string
interval:
description: Interval task schedule interval e.g. 3d2h10m, valid units are d, h, m, s (default "0").
type: string
Expand All @@ -1456,7 +1458,7 @@ spec:
format: int64
type: integer
parallel:
description: Parallel The maximum number of repair jobs to run in parallel, each node can participate in at most one repair at any given time. Default is means system will repair at maximum parallelism.
description: 'Parallel the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas). Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used. The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes. The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.'
format: int64
type: integer
smallTableThreshold:
Expand Down Expand Up @@ -1606,12 +1608,14 @@ spec:
failFast:
description: FailFast stop repair on first error.
type: boolean
host:
description: Host to repair, by default all hosts are repaired
type: string
id:
type: string
intensity:
description: Intensity integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load. 0 value means repair at maximum intensity.
format: int64
type: integer
description: Intensity how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1. If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs). Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs. Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume. For Scylla clusters that DO NOT SUPPORT ROW-LEVEL REPAIR, intensity can be a decimal between (0,1). In that case it specifies percent of shards that can be repaired in parallel on a repair master node. For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
type: string
interval:
description: Interval task schedule interval e.g. 3d2h10m, valid units are d, h, m, s (default "0").
type: string
Expand All @@ -1628,7 +1632,7 @@ spec:
format: int64
type: integer
parallel:
description: Parallel The maximum number of repair jobs to run in parallel, each node can participate in at most one repair at any given time. Default is means system will repair at maximum parallelism.
description: 'Parallel the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas). Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used. The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes. The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.'
format: int64
type: integer
smallTableThreshold:
Expand Down
19 changes: 14 additions & 5 deletions pkg/api/v1alpha1/cluster_types.go
Expand Up @@ -83,18 +83,27 @@ type RepairTaskSpec struct {
DC []string `json:"dc,omitempty" mapstructure:"dc,omitempty"`
// FailFast stop repair on first error.
FailFast *bool `json:"failFast,omitempty" mapstructure:"fail_fast,omitempty"`
// Intensity integer >= 1 or a decimal between (0,1), higher values may result in higher speed and cluster load.
// 0 value means repair at maximum intensity.
Intensity *int64 `json:"intensity,omitempty" mapstructure:"intensity,omitempty"`
// Parallel The maximum number of repair jobs to run in parallel, each node can participate in at most one repair
// at any given time. Default is means system will repair at maximum parallelism.
// Intensity how many token ranges (per shard) to repair in a single Scylla repair job. By default this is 1.
// If you set it to 0 the number of token ranges is adjusted to the maximum supported by node (see max_repair_ranges_in_parallel in Scylla logs).
// Valid values are 0 and integers >= 1. Higher values will result in increased cluster load and slightly faster repairs.
// Changing the intensity impacts repair granularity if you need to resume it, the higher the value the more work on resume.
// For Scylla clusters that *do not support row-level repair*, intensity can be a decimal between (0,1).
// In that case it specifies percent of shards that can be repaired in parallel on a repair master node.
// For Scylla clusters that are row-level repair enabled, setting intensity below 1 has the same effect as setting intensity 1.
Intensity *string `json:"intensity,omitempty" mapstructure:"intensity,omitempty"`
// Parallel the maximum number of Scylla repair jobs that can run at the same time (on different token ranges and replicas).
// Each node can take part in at most one repair at any given moment. By default the maximum possible parallelism is used.
// The effective parallelism depends on a keyspace replication factor (RF) and the number of nodes.
// The formula to calculate it is as follows: number of nodes / RF, ex. for 6 node cluster with RF=3 the maximum parallelism is 2.
Parallel *int64 `json:"parallel,omitempty" mapstructure:"parallel,omitempty"`
// Keyspace a list of keyspace/tables glob patterns, e.g. 'keyspace,!keyspace.table_prefix_*'
// used to include or exclude keyspaces from repair.
Keyspace []string `json:"keyspace,omitempty" mapstructure:"keyspace,omitempty"`
// SmallTableThreshold enable small table optimization for tables of size lower than given threshold.
// Supported units [B, MiB, GiB, TiB] (default "1GiB").
SmallTableThreshold *string `json:"smallTableThreshold,omitempty" mapstructure:"small_table_threshold,omitempty"`
// Host to repair, by default all hosts are repaired
Host *string `json:"host,omitempty" mapstructure:"host,omitempty"`
}

type BackupTaskSpec struct {
Expand Down
10 changes: 10 additions & 0 deletions pkg/api/v1alpha1/cluster_validation.go
Expand Up @@ -2,6 +2,7 @@ package v1alpha1

import (
"reflect"
"strconv"

"github.com/blang/semver"
"github.com/pkg/errors"
Expand Down Expand Up @@ -84,6 +85,15 @@ func checkValues(c *ScyllaCluster) error {
}
}

for _, r := range c.Spec.Repairs {
if r.Intensity != nil {
_, err := strconv.ParseFloat(*r.Intensity, 64)
if err != nil {
return errors.Errorf("invalid intensity %q in %q repair task, it must be a float value", *r.Intensity, r.Name)
}
}
}

return nil
}

Expand Down
12 changes: 11 additions & 1 deletion pkg/api/v1alpha1/cluster_validation_test.go
Expand Up @@ -8,10 +8,10 @@ import (
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/utils/pointer"
)

func TestCheckValues(t *testing.T) {

validCluster := unit.NewSingleRackCluster(3)
validCluster.Spec.Datacenter.Racks[0].Resources = corev1.ResourceRequirements{
Limits: map[corev1.ResourceName]resource.Quantity{
Expand All @@ -23,6 +23,11 @@ func TestCheckValues(t *testing.T) {
sameName := validCluster.DeepCopy()
sameName.Spec.Datacenter.Racks = append(sameName.Spec.Datacenter.Racks, sameName.Spec.Datacenter.Racks[0])

invalidIntensity := validCluster.DeepCopy()
invalidIntensity.Spec.Repairs = append(invalidIntensity.Spec.Repairs, v1alpha1.RepairTaskSpec{
Intensity: pointer.StringPtr("100Mib"),
})

tests := []struct {
name string
obj *v1alpha1.ScyllaCluster
Expand All @@ -38,6 +43,11 @@ func TestCheckValues(t *testing.T) {
obj: sameName,
allowed: false,
},
{
name: "invalid intensity in repair task spec",
obj: invalidIntensity,
allowed: false,
},
}

for _, test := range tests {
Expand Down
6 changes: 6 additions & 0 deletions pkg/api/v1alpha1/cluster_webhook.go
Expand Up @@ -73,6 +73,12 @@ func (c *ScyllaCluster) Default() {
if repairTask.SmallTableThreshold == nil {
c.Spec.Repairs[i].SmallTableThreshold = pointer.StringPtr("1GiB")
}
if repairTask.Intensity == nil {
c.Spec.Repairs[i].Intensity = pointer.StringPtr("1")
}
if repairTask.Parallel == nil {
c.Spec.Repairs[i].Parallel = pointer.Int64Ptr(0)
}
}

for i, backupTask := range c.Spec.Backups {
Expand Down
7 changes: 6 additions & 1 deletion pkg/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions pkg/controllers/manager/sync_test.go
Expand Up @@ -111,7 +111,7 @@ func TestManagerSynchronization(t *testing.T) {
},
DC: []string{"dc1"},
FailFast: pointer.BoolPtr(false),
Intensity: pointer.Int64Ptr(17),
Intensity: pointer.StringPtr("0.5"),
Keyspace: []string{"keyspace1"},
},
},
Expand Down Expand Up @@ -180,7 +180,7 @@ func TestManagerSynchronization(t *testing.T) {
SchedulerTaskSpec: v1alpha1.SchedulerTaskSpec{
Name: "repair",
},
Intensity: pointer.Int64Ptr(666),
Intensity: pointer.StringPtr("666"),
},
},
},
Expand All @@ -197,7 +197,7 @@ func TestManagerSynchronization(t *testing.T) {
SchedulerTaskSpec: v1alpha1.SchedulerTaskSpec{
Name: "repair",
},
Intensity: pointer.Int64Ptr(123),
Intensity: pointer.StringPtr("123"),
},
ID: "repair-id",
},
Expand All @@ -214,7 +214,7 @@ func TestManagerSynchronization(t *testing.T) {
SchedulerTaskSpec: v1alpha1.SchedulerTaskSpec{
Name: "repair",
},
Intensity: pointer.Int64Ptr(666),
Intensity: pointer.StringPtr("666"),
},
},
},
Expand All @@ -227,7 +227,7 @@ func TestManagerSynchronization(t *testing.T) {
SchedulerTaskSpec: v1alpha1.SchedulerTaskSpec{
Name: "repair",
},
Intensity: pointer.Int64Ptr(666),
Intensity: pointer.StringPtr("666"),
},
},
},
Expand All @@ -244,7 +244,7 @@ func TestManagerSynchronization(t *testing.T) {
SchedulerTaskSpec: v1alpha1.SchedulerTaskSpec{
Name: "repair",
},
Intensity: pointer.Int64Ptr(666),
Intensity: pointer.StringPtr("666"),
},
ID: "repair-id",
},
Expand Down

0 comments on commit 043eb7a

Please sign in to comment.