Skip to content

Commit

Permalink
add support for Network.Subdomain
Browse files Browse the repository at this point in the history
Problem: A user who wants to create jobs on the same network, or on an existing
network, currently cannot, because the default behavior with DNS enabled is to
create a different network for each replicated job.
Solution: Allow the user to specify a subdomain for the Network.

Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed May 31, 2023
1 parent 321a9d6 commit 5952438
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 3 deletions.
5 changes: 5 additions & 0 deletions api/v1alpha1/jobset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ type Network struct {
// <jobSet.name>-<spec.replicatedJob.name>-<job-index>-<pod-index>.<jobSet.name>-<spec.replicatedJob.name>
// +optional
EnableDNSHostnames *bool `json:"enableDNSHostnames,omitempty"`

// Subdomain is an explicit choice for a network subdomain name.
// When set, the replicated job is added to this network. This would allow sharing common networks.
// +optional
Subdomain string `json:"subdomain,omitempty"`
}

// Operator defines the target of a SuccessPolicy or FailurePolicy.
Expand Down
5 changes: 5 additions & 0 deletions config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ spec:
fully qualified pod hostname, which is in the format:
<jobSet.name>-<spec.replicatedJob.name>-<job-index>-<pod-index>.<jobSet.name>-<spec.replicatedJob.name>'
type: boolean
subdomain:
description: Subdomain is an explicit choice for a network
subdomain name. When set, the replicated job is added to
this network. This would allow sharing common networks.
type: string
type: object
replicas:
default: 1
Expand Down
13 changes: 10 additions & 3 deletions pkg/controllers/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,8 @@ func (r *JobSetReconciler) createJobs(ctx context.Context, js *jobset.JobSet, ow
func (r *JobSetReconciler) createHeadlessSvcIfNotExist(ctx context.Context, js *jobset.JobSet, rjob *jobset.ReplicatedJob) error {
log := ctrl.LoggerFrom(ctx)

// Check if service already exists. Service name is <jobSetName>-<replicatedJobName>.
// If the service does not exist, create it.
// Check if service already exists. Default service name is <jobSetName>-<replicatedJobName> unless otherwise specified.
// If a subdomain is provided already, it should also be in the same namespace.
var headlessSvc corev1.Service
subdomain := GenSubdomain(js, rjob)
if err := r.Get(ctx, types.NamespacedName{Name: subdomain, Namespace: js.Namespace}, &headlessSvc); err != nil {
Expand All @@ -390,6 +390,7 @@ func (r *JobSetReconciler) createHeadlessSvcIfNotExist(ctx context.Context, js *
Name: subdomain,
Namespace: js.Namespace,
},
// TODO how to handle replicated job selectors here when more than one job share a service?
Spec: corev1.ServiceSpec{
ClusterIP: "None",
Selector: map[string]string{
Expand Down Expand Up @@ -573,7 +574,8 @@ func constructJob(js *jobset.JobSet, rjob *jobset.ReplicatedJob, jobIdx int) (*b
labelAndAnnotateObject(&job.Spec.Template, js, rjob, jobIdx)

// If enableDNSHostnames is set, update job spec to set subdomain as
// job name (a headless service with same name as job will be created later).
// job name (a headless service with a chosen name or the
// same name as job will be created or found later).
if dnsHostnamesEnabled(rjob) {
job.Spec.Template.Spec.Subdomain = GenSubdomain(js, rjob)
}
Expand Down Expand Up @@ -681,6 +683,11 @@ func genJobName(js *jobset.JobSet, rjob *jobset.ReplicatedJob, jobIndex int) str
}

// GenSubdomain returns the subdomain to use for the given replicated job.
//
// When rjob.Network.Subdomain is non-empty it is returned unchanged;
// otherwise the result is "<js.Name>-<rjob.Name>".
//
// NOTE(review): rjob.Network is dereferenced without a nil check; if Network
// is a pointer field, callers presumably must ensure it is set — confirm.
func GenSubdomain(js *jobset.JobSet, rjob *jobset.ReplicatedJob) string {
	// An explicitly chosen subdomain takes precedence over the generated one.
	if explicit := rjob.Network.Subdomain; explicit != "" {
		return explicit
	}
	return fmt.Sprintf("%s-%s", js.Name, rjob.Name)
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/util/testing/wrappers.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ func (r *ReplicatedJobWrapper) EnableDNSHostnames(val bool) *ReplicatedJobWrappe
return r
}

// NetworkSubdomain sets the value of ReplicatedJob.Network.Subdomain and
// returns the same wrapper so that calls can be chained.
// NOTE(review): Network is written through without a nil check; if it is a
// pointer field it presumably must be initialized beforehand — confirm.
func (r *ReplicatedJobWrapper) NetworkSubdomain(val string) *ReplicatedJobWrapper {
r.ReplicatedJob.Network.Subdomain = val
return r
}

// Replicas sets the value of the ReplicatedJob.Replicas.
func (r *ReplicatedJobWrapper) Replicas(val int) *ReplicatedJobWrapper {
r.ReplicatedJob.Replicas = val
Expand Down
42 changes: 42 additions & 0 deletions test/integration/controller/jobset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,18 @@ var _ = ginkgo.Describe("JobSet controller", func() {
},
},
}),
ginkgo.Entry("jobset with DNS hostnames enabled and shared network name should create 1 headless service and succeed when all jobs succeed", &testCase{
makeJobSet: sharedNetworkJobSet,
updates: []*update{
{
checkJobSetState: checkSharedService,
},
{
jobUpdateFn: completeAllJobs,
checkJobSetCondition: testutil.JobSetCompleted,
},
},
}),
ginkgo.Entry("succeeds from first run", &testCase{
makeJobSet: testJobSet,
updates: []*update{
Expand Down Expand Up @@ -813,6 +825,17 @@ func jobActive(job *batchv1.Job) bool {
return active
}

// checkSharedService asserts that the namespace of the given JobSet eventually
// contains exactly one Service, i.e. that all replicated jobs share a single
// headless service.
//
// Every Service in js.Namespace is counted, not only those belonging to js.
// No explicit timeout or polling interval is passed to gomega.Eventually.
func checkSharedService(js *jobset.JobSet) {
	// countServices reports how many Services currently exist in the
	// namespace, or -1 together with the error if listing fails.
	countServices := func() (int, error) {
		var services corev1.ServiceList
		err := k8sClient.List(ctx, &services, client.InNamespace(js.Namespace))
		if err != nil {
			return -1, err
		}
		return len(services.Items), nil
	}

	gomega.Eventually(countServices).Should(gomega.Equal(1))
}

// 2 replicated jobs:
// - one with 1 replica
// - one with 3 replicas and DNS hostnames enabled
Expand All @@ -829,3 +852,22 @@ func testJobSet(ns *corev1.Namespace) *testing.JobSetWrapper {
Replicas(3).
Obj())
}

// sharedNetworkJobSet builds a JobSet named "shared-network-js" in the given
// namespace whose success policy uses jobset.OperatorAll with an empty list
// of target replicated jobs. It contains two replicated jobs, both with DNS
// hostnames enabled and the network subdomain set to "shared-network":
//   - "replicated-job-a": 1 replica
//   - "replicated-job-b": 3 replicas, job template using Indexed completion mode
func sharedNetworkJobSet(ns *corev1.Namespace) *testing.JobSetWrapper {
	const sharedSubdomain = "shared-network"

	base := testing.MakeJobSet("shared-network-js", ns.Name).
		SuccessPolicy(&jobset.SuccessPolicy{Operator: jobset.OperatorAll, TargetReplicatedJobs: []string{}})

	replicatedA := testing.MakeReplicatedJob("replicated-job-a").
		Job(testing.MakeJobTemplate("test-job-A", ns.Name).PodSpec(testing.TestPodSpec).Obj()).
		EnableDNSHostnames(true).
		NetworkSubdomain(sharedSubdomain).
		Replicas(1).
		Obj()
	withA := base.ReplicatedJob(replicatedA)

	// The subdomain and DNS setters are applied here in the opposite order
	// from replicated-job-a.
	// NOTE(review): confirm that EnableDNSHostnames does not reset a
	// previously set subdomain.
	replicatedB := testing.MakeReplicatedJob("replicated-job-b").
		Job(testing.MakeJobTemplate("test-job-B", ns.Name).PodSpec(testing.TestPodSpec).CompletionMode(batchv1.IndexedCompletion).Obj()).
		NetworkSubdomain(sharedSubdomain).
		EnableDNSHostnames(true).
		Replicas(3).
		Obj()

	return withA.ReplicatedJob(replicatedB)
}

0 comments on commit 5952438

Please sign in to comment.