Skip to content

Commit

Permalink
updates from review
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Jun 12, 2023
1 parent 57e1fb2 commit 7f341b9
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 59 deletions.
6 changes: 4 additions & 2 deletions api/jobset/v1alpha2/jobset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,15 @@ type ReplicatedJob struct {

type Network struct {
// EnableDNSHostnames allows pods to be reached via their hostnames.
// Pods will be reachable using the fully qualified pod hostname.
// Pods will be reachable using the fully qualified pod hostname:
// <jobSet.name>-<spec.replicatedJob.name>-<job-index>-<pod-index>.<subdomain>
// Child jobs will share the same subdomain whether or not Subdomain is set.
// +optional
EnableDNSHostnames *bool `json:"enableDNSHostnames,omitempty"`

// Subdomain is an explicit choice for a network subdomain name
// When set, any replicated job in the set is added to this network.
// If Subdomain is not set, we use <jobSet.name>-
// Defaults to <jobSet.name> if not set.
// +optional
Subdomain string `json:"subdomain,omitempty"`
}
Expand Down
4 changes: 4 additions & 0 deletions api/jobset/v1alpha2/jobset_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ func (js *JobSet) Default() {
if js.Spec.Network.EnableDNSHostnames == nil {
js.Spec.Network.EnableDNSHostnames = pointer.Bool(true)
}
// Subdomain defaults to the JobSet name
if js.Spec.Network.Subdomain == "" {
js.Spec.Network.Subdomain = js.Name
}
// Default pod restart policy to OnFailure.
if js.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.RestartPolicy == "" {
js.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
Expand Down
8 changes: 5 additions & 3 deletions config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,16 @@ spec:
description: Network defines the networking options for the jobset.
properties:
enableDNSHostnames:
description: EnableDNSHostnames allows pods to be reached via
description: 'EnableDNSHostnames allows pods to be reached via
their hostnames. Pods will be reachable using the fully qualified
pod hostname.
pod hostname: <jobSet.name>-<spec.replicatedJob.name>-<job-index>-<pod-index>.<subdomain>
Child jobs will share the same subdomain whether or not Subdomain
is set.'
type: boolean
subdomain:
description: Subdomain is an explicit choice for a network subdomain
name When set, any replicated job in the set is added to this
network. If Subdomain is not set, we use <jobSet.name>-
network. Defaults to <jobSet.name> if not set.
type: string
type: object
replicatedJobs:
Expand Down
26 changes: 12 additions & 14 deletions pkg/controllers/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,19 +351,18 @@ func (r *JobSetReconciler) resumeJobSetIfNecessary(ctx context.Context, js *jobs
func (r *JobSetReconciler) createJobs(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs) error {
log := ctrl.LoggerFrom(ctx)

// If pod DNS hostnames are enabled, create a headless service for the JobSet
if dnsHostnamesEnabled(js) {
if err := r.createHeadlessSvcIfNotExist(ctx, js); err != nil {
return err
}
}

for _, rjob := range js.Spec.ReplicatedJobs {
jobs, err := constructJobsFromTemplate(js, &rjob, ownedJobs)
if err != nil {
return err
}

// If pod DNS hostnames are enabled, create a headless service per replicatedjob.
if dnsHostnamesEnabled(js) {
if err := r.createHeadlessSvcIfNotExist(ctx, js, &rjob); err != nil {
return err
}
}

for _, job := range jobs {
// Set jobset controller as owner of the job for garbage collection and reconcilation.
if err := ctrl.SetControllerReference(js, job, r.Scheme); err != nil {
Expand All @@ -383,11 +382,12 @@ func (r *JobSetReconciler) createJobs(ctx context.Context, js *jobset.JobSet, ow

// TODO: look into adopting service and updating the selector
// if it is not matching the job selector.
func (r *JobSetReconciler) createHeadlessSvcIfNotExist(ctx context.Context, js *jobset.JobSet, rjob *jobset.ReplicatedJob) error {
func (r *JobSetReconciler) createHeadlessSvcIfNotExist(ctx context.Context, js *jobset.JobSet) error {
log := ctrl.LoggerFrom(ctx)

// Check if service already exists. Default service name is <jobSetName>-<replicatedJobName> unless otherwise specified.
// If a subdomain is provided already, it should also be in the same namespace.
// Check if service already exists. The service name should match the subdomain specified in
// Spec.Network.Subdomain, with default of <jobSetName> set by the webhook.
// If the service doesn't exist in the same namespace, create it.
var headlessSvc corev1.Service
subdomain := GenSubdomain(js)
if err := r.Get(ctx, types.NamespacedName{Name: subdomain, Namespace: js.Namespace}, &headlessSvc); err != nil {
Expand All @@ -396,12 +396,10 @@ func (r *JobSetReconciler) createHeadlessSvcIfNotExist(ctx context.Context, js *
Name: subdomain,
Namespace: js.Namespace,
},
// TODO how to handle replicated job selectors here when more than one job share a service?
Spec: corev1.ServiceSpec{
ClusterIP: "None",
Selector: map[string]string{
jobset.JobSetNameKey: js.Name,
jobset.ReplicatedJobNameKey: rjob.Name,
jobset.JobSetNameKey: js.Name,
},
},
}
Expand Down
40 changes: 0 additions & 40 deletions test/integration/controller/jobset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,18 +352,6 @@ var _ = ginkgo.Describe("JobSet controller", func() {
},
},
}),
ginkgo.Entry("jobset with DNS hostnames enabled and shared network name should create 1 headless service and succeed when all jobs succeed", &testCase{
makeJobSet: sharedNetworkJobSet,
updates: []*update{
{
checkJobSetState: checkSharedService,
},
{
jobUpdateFn: completeAllJobs,
checkJobSetCondition: testutil.JobSetCompleted,
},
},
}),
ginkgo.Entry("succeeds from first run", &testCase{
makeJobSet: testJobSet,
updates: []*update{
Expand Down Expand Up @@ -811,17 +799,6 @@ func jobActive(job *batchv1.Job) bool {
return active
}

// Check one headless service across all replicated jobs
func checkSharedService(js *jobset.JobSet) {
gomega.Eventually(func() (int, error) {
var svcList corev1.ServiceList
if err := k8sClient.List(ctx, &svcList, client.InNamespace(js.Namespace)); err != nil {
return -1, err
}
return len(svcList.Items), nil
}).Should(gomega.Equal(1))
}

// 2 replicated jobs:
// - one with 1 replica
// - one with 3 replicas and DNS hostnames enabled
Expand All @@ -838,20 +815,3 @@ func testJobSet(ns *corev1.Namespace) *testing.JobSetWrapper {
Replicas(3).
Obj())
}

// 2 replicated jobs:
// - same as above, but with both having DNS enabled and a shared network
func sharedNetworkJobSet(ns *corev1.Namespace) *testing.JobSetWrapper {
return testing.MakeJobSet("shared-network-js", ns.Name).
NetworkSubdomain("shared-network").
EnableDNSHostnames(true).
SuccessPolicy(&jobset.SuccessPolicy{Operator: jobset.OperatorAll, TargetReplicatedJobs: []string{}}).
ReplicatedJob(testing.MakeReplicatedJob("replicated-job-a").
Job(testing.MakeJobTemplate("test-job-A", ns.Name).PodSpec(testing.TestPodSpec).Obj()).
Replicas(1).
Obj()).
ReplicatedJob(testing.MakeReplicatedJob("replicated-job-b").
Job(testing.MakeJobTemplate("test-job-B", ns.Name).PodSpec(testing.TestPodSpec).CompletionMode(batchv1.IndexedCompletion).Obj()).
Replicas(3).
Obj())
}

0 comments on commit 7f341b9

Please sign in to comment.