This repository has been archived by the owner on Apr 29, 2020. It is now read-only.

Disable daemon set deletion behaviour #754

Merged: 5 commits, Feb 9, 2017
22 changes: 3 additions & 19 deletions pkg/ds/daemon_set.go
@@ -6,6 +6,7 @@ import (
"os/user"
"time"

"github.com/Sirupsen/logrus"
"github.com/square/p2/pkg/ds/fields"
"github.com/square/p2/pkg/health"
"github.com/square/p2/pkg/health/checker"
@@ -275,27 +276,10 @@ func (ds *daemonSet) WatchDesires(

case deleteDS, ok := <-deletedCh:
if !ok {
// channel closed
return
}
ds.logger.NoFields().Infof("Received daemon set delete signal: %v", deleteDS)
if deleteDS == nil {
ds.logger.Errorf("Unexpected nil daemon set during delete")
return
}
if ds.ID() != deleteDS.ID {
err = util.Errorf("Expected uuid to be the same, expected '%v', got '%v'", ds.ID(), deleteDS.ID)
continue
}

err = ds.clearPods()
if err != nil {
err = util.Errorf("Unable to clear pods from intent tree: %v", err)
select {
case errCh <- err:
case <-quitCh:
}
}
// Deleting a daemon set has no effect
ds.logger.WithFields(logrus.Fields{"id": deleteDS, "node_selector": ds.NodeSelector.String()}).Infof("Daemon Set Deletion is disabled and has no effect. You may want to clean this up manually.")
return

case labeledChanges, ok := <-nodesChangedCh:
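Since the delete branch now only logs "Daemon Set Deletion is disabled and has no effect. You may want to clean this up manually." and returns, any pods the deleted daemon set labeled stay scheduled. As a rough sketch (not part of this PR), locating those leftovers could reuse the same label selector the new labeledPods test helper uses; an actual cleanup would still need the work the removed clearPods() call used to do.

// Sketch only: list pods still labeled for a daemon set whose delete signal was
// ignored. The selector mirrors labeledPods in daemon_set_test.go; ds is assumed
// to be the *daemonSet that received the delete.
selector := klabels.Everything().Add(DSIDLabel, klabels.EqualsOperator, []string{ds.ID().String()})
orphaned, err := ds.applicator.GetMatches(selector, labels.POD, false)
if err != nil {
	ds.logger.Errorf("unable to list pods for deleted daemon set: %v", err)
} else {
	for _, pod := range orphaned {
		// pod.ID is "<node>/<podID>", e.g. "node2/testPod"
		ds.logger.NoFields().Infof("pod still scheduled for deleted daemon set: %v", pod.ID)
	}
}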
83 changes: 46 additions & 37 deletions pkg/ds/daemon_set_test.go
@@ -27,13 +27,6 @@ const (
testDSRetryInterval = time.Duration(1000 * time.Millisecond)
)

func scheduledPods(t *testing.T, ds *daemonSet) []labels.Labeled {
selector := klabels.Everything().Add(DSIDLabel, klabels.EqualsOperator, []string{ds.ID().String()})
labeled, err := ds.applicator.GetMatches(selector, labels.POD, false)
Assert(t).IsNil(err, "expected no error matching pods")
return labeled
}

func waitForNodes(
t *testing.T,
ds DaemonSet,
@@ -179,11 +172,11 @@ func TestSchedule(t *testing.T) {
false,
).(*daemonSet)

scheduled := scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 0, "expected no pods to have been labeled")
labeled := labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), 0, "expected no pods to have been labeled")

err = waitForPodsInIntent(consulStore, 0)
Assert(t).IsNil(err, "Unexpected number of pods scheduled")
Assert(t).IsNil(err, "Unexpected number of pods labeled")

err = applicator.SetLabel(labels.NODE, "node1", "nodeQuality", "bad")
Assert(t).IsNil(err, "expected no error labeling node1")
@@ -204,18 +197,18 @@ func TestSchedule(t *testing.T) {
dsChangesErrCh := watchDSChanges(ds, dsStore, quitCh, updatedCh, deletedCh)

//
// Verify that the pod has been scheduled
// Verify that the pod has been labeled
//
numNodes := waitForNodes(t, ds, 1, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 1, "took too long to schedule")

scheduled = scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 1, "expected a node to have been labeled")
Assert(t).AreEqual(scheduled[0].ID, "node2/testPod", "expected node labeled with the daemon set's id")
labeled = labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), 1, "expected a node to have been labeled")
Assert(t).AreEqual(labeled[0].ID, "node2/testPod", "expected node labeled with the daemon set's id")

// Verify that the scheduled pod is correct
// Verify that the labeled pod is correct
err = waitForSpecificPod(consulStore, "node2", types.PodID("testPod"))
Assert(t).IsNil(err, "Unexpected pod scheduled")
Assert(t).IsNil(err, "Unexpected pod labeled")

//
// Add 10 good nodes and 10 bad nodes then verify
@@ -236,8 +229,8 @@ func TestSchedule(t *testing.T) {
numNodes = waitForNodes(t, ds, 11, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 11, "took too long to schedule")

scheduled = scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 11, "expected a lot of nodes to have been labeled")
labeled = labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), 11, "expected a lot of nodes to have been labeled")

//
// Add a node with the labels nodeQuality=good and cherry=pick
@@ -264,9 +257,9 @@ func TestSchedule(t *testing.T) {
numNodes = waitForNodes(t, ds, 1, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 1, "took too long to schedule")

// Verify that the scheduled pod is correct
// Verify that the labeled pod is correct
err = waitForSpecificPod(consulStore, "nodeOk", types.PodID("testPod"))
Assert(t).IsNil(err, "Unexpected pod scheduled")
Assert(t).IsNil(err, "Unexpected pod labeled")

//
// Disabling the daemon set and making a change should not do anything
@@ -295,25 +288,29 @@ func TestSchedule(t *testing.T) {
Assert(t).IsNil(err, "Unxpected error trying to mutate daemon set")

// 11 good nodes 11 bad nodes, and 1 good cherry picked node = 23 nodes
numNodes = waitForNodes(t, ds, 23, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 23, "took too long to schedule")
expectedNodes := 23
numNodes = waitForNodes(t, ds, expectedNodes, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, expectedNodes, "took too long to schedule")

//
// Deleting the daemon set should unschedule all of its nodes
//
beforeDeletePods, _, err := scheduledPods(consulStore)
Assert(t).IsNil(err, "Unable to list intent/")
ds.logger.NoFields().Info("Deleting daemon set...")
err = dsStore.Delete(ds.ID())
if err != nil {
t.Fatalf("Unable to delete daemon set: %v", err)
}
numNodes = waitForNodes(t, ds, 0, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 0, "took too long to unschedule")
// behavior change: Deleting a daemon set will no longer unschedule its pods (for now)
numNodes = waitForNodes(t, ds, 12, desiresErrCh, dsChangesErrCh)
Collaborator:
hey hold on a second. This line waits for the number of nodes to decrease to 12 (spoiler, it never will)

Collaborator:
This doesn't cause any test failures of course, but it adds 10 seconds to each test.

Then again, you may say that's a feature: We make sure that no pods are deleted for ten seconds.

Assert(t).AreEqual(numNodes, expectedNodes, "Unexpected number of nodes labeled")
Collaborator:
and then this line checks that the number of nodes is 23, not 12
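For context on the ten-second cost discussed above: waitForNodes polls until the count it is given shows up or a timeout elapses. The fragment below is a minimal, self-contained sketch of that pattern, not p2's actual waitForNodes; the name, signature, and intervals are made up for illustration.

// Sketch only: poll until the observed count equals want or the timeout expires,
// then return whatever was last seen. If want is unreachable (here, 12, since
// deletion no longer unschedules pods), the loop always runs out the full timeout.
func waitForCount(current func() int, want int, retryInterval, timeout time.Duration) int {
	deadline := time.Now().Add(timeout)
	for {
		n := current()
		if n == want || time.Now().After(deadline) {
			return n
		}
		time.Sleep(retryInterval)
	}
}

Called with an unreachable want it still returns, so the assertion afterwards can pass, but only after paying the full timeout, which matches the roughly ten extra seconds per test noted above.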


scheduled = scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 0, "expected all nodes to have been unlabeled")
labeled = labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), expectedNodes, "Expected no nodes to be unlabeled")

err = waitForPodsInIntent(consulStore, 0)
Assert(t).IsNil(err, "Unexpected number of pods scheduled")
err = waitForPodsInIntent(consulStore, len(beforeDeletePods))
Collaborator:
maybe this is what's causing test flakes - we'd like it to be 12 (Right?), but sometimes tests expect it to be 1, presumably because the scheduledPods call doesn't get all of them.

Collaborator:
that's probably it yep
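One way to make this deterministic, sketched here rather than taken from the PR: settle the intent tree before taking the beforeDeletePods snapshot, so len(beforeDeletePods) cannot be captured mid-write. waitForPodsInIntent and scheduledPods are the helpers defined later in this file; podsExpectedBeforeDelete is a hypothetical count for however many pods the test has written at that point.

// Sketch only, not this PR's code: wait for the intent tree to settle before
// snapshotting it, so the later waitForPodsInIntent(consulStore, len(beforeDeletePods))
// check is not comparing against a partial listing.
err = waitForPodsInIntent(consulStore, podsExpectedBeforeDelete) // hypothetical expected count
Assert(t).IsNil(err, "intent tree never settled before deleting the daemon set")

beforeDeletePods, _, err := scheduledPods(consulStore)
Assert(t).IsNil(err, "Unable to list intent/")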

Assert(t).IsNil(err, "Unexpected number of pods labeled")
}

func TestPublishToReplication(t *testing.T) {
@@ -365,10 +362,10 @@ func TestPublishToReplication(t *testing.T) {
false,
).(*daemonSet)

scheduled := scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 0, "expected no pods to have been labeled")
labeled := labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), 0, "expected no pods to have been labeled")
err = waitForPodsInIntent(consulStore, 0)
Assert(t).IsNil(err, "Unexpected number of pods scheduled")
Assert(t).IsNil(err, "Unexpected number of pods labeled")

err = applicator.SetLabel(labels.NODE, "node1", "nodeQuality", "good")
Assert(t).IsNil(err, "expected no error labeling node1")
@@ -389,13 +386,13 @@ func TestPublishToReplication(t *testing.T) {
dsChangesErrCh := watchDSChanges(ds, dsStore, quitCh, updatedCh, deletedCh)

//
// Verify that 2 pods have been scheduled
// Verify that 2 pods have been labeled
//
numNodes := waitForNodes(t, ds, 2, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 2, "took too long to schedule")

scheduled = scheduledPods(t, ds)
Assert(t).AreEqual(len(scheduled), 2, "expected a node to have been labeled")
labeled = labeledPods(t, ds)
Assert(t).AreEqual(len(labeled), 2, "expected a node to have been labeled")

// Mutate the daemon set so that no nodes are eligible.
// The mutation shouldn't produce an error, but the daemon set should when attempting to meet.
@@ -414,7 +411,7 @@ func TestPublishToReplication(t *testing.T) {
Assert(t).IsNotNil(err, "Unexpectedly nil error when no nodes are eligible")
}
numNodes = waitForNodes(t, ds, 2, desiresErrCh, dsChangesErrCh)
Assert(t).AreEqual(numNodes, 2, "unexpectedly unscheduled")
Assert(t).AreEqual(numNodes, 2, "unexpectedly unlabeled")
}

// Polls for the store to have the same number of pods as the argument
@@ -425,6 +422,7 @@ func waitForPodsInIntent(consulStore *consultest.FakePodStore, numPodsExpected i
return util.Errorf("Unable to get all pods from pod store: %v", err)
}
if len(manifestResults) != numPodsExpected {
fmt.Printf("%d : %v\n\n", len(manifestResults), manifestResults)
return util.Errorf(
"Expected no manifests to be scheduled, got '%v' manifests, expected '%v'",
len(manifestResults),
@@ -447,10 +445,10 @@ func waitForSpecificPod(consulStore *consultest.FakePodStore, nodeName types.Nod
return util.Errorf("expected a manifest in the intent tree")
}
if manifestResults[0].PodLocation.Node != nodeName {
return util.Errorf("expected manifest scheduled on the right node")
return util.Errorf("expected manifest labeled on the right node")
}
if manifestResults[0].PodLocation.PodID != podID {
return util.Errorf("expected manifest scheduled with correct pod ID")
return util.Errorf("expected manifest labeled with correct pod ID")
}
if manifestResults[0].Manifest.ID() != podID {
return util.Errorf("expected manifest with correct ID")
@@ -459,3 +457,14 @@ func waitForSpecificPod(consulStore *consultest.FakePodStore, nodeName types.Nod
}
return waitForCondition(condition)
}

func labeledPods(t *testing.T, ds *daemonSet) []labels.Labeled {
selector := klabels.Everything().Add(DSIDLabel, klabels.EqualsOperator, []string{ds.ID().String()})
labeled, err := ds.applicator.GetMatches(selector, labels.POD, false)
Assert(t).IsNil(err, "expected no error matching pods")
return labeled
}

func scheduledPods(consulStore *consultest.FakePodStore) ([]consul.ManifestResult, time.Duration, error) {
return consulStore.AllPods(consul.INTENT_TREE)
}
14 changes: 8 additions & 6 deletions pkg/ds/farm_test.go
@@ -506,14 +506,15 @@ func TestFarmSchedule(t *testing.T) {
dsID = labeled.Labels.Get(DSIDLabel)
Assert(t).AreEqual(anotherDSData.ID.String(), dsID, "Unexpected dsID labeled")

dsStore.Delete(anotherDSData.ID)
err = dsStore.Delete(anotherDSData.ID)
Collaborator:
that was a line I added?

yeah it was.

I am sorry. The errors are completely bogus because I didn't assign too...

Assert(t).IsNil(err, "Expected no error deleting daemon set")
err = waitForDelete(dsf, anotherDSData.ID)
Assert(t).IsNil(err, "Expected daemon set to be deleted in farm")

// Verify node3 is unscheduled
labeled, err = waitForPodLabel(applicator, false, "node3/testPod")
Assert(t).IsNil(err, "Expected pod not to have a dsID label")
// behavior change: Daemon Set deletions do not delete their pods (for now)
labeled, err = waitForPodLabel(applicator, true, "node3/testPod")
Assert(t).IsNil(err, "Expected pod to have a dsID label")
}

func TestCleanupPods(t *testing.T) {
@@ -903,12 +904,13 @@ func TestMultipleFarms(t *testing.T) {
dsID = labeled.Labels.Get(DSIDLabel)
Assert(t).AreEqual(anotherDSData.ID.String(), dsID, "Unexpected dsID labeled")

dsStore.Delete(anotherDSData.ID)
err = dsStore.Delete(anotherDSData.ID)
Assert(t).IsNil(err, "Expected no error deleting daemon set")

// Verify node3 is unscheduled
labeled, err = waitForPodLabel(applicator, false, "node3/testPod")
Assert(t).IsNil(err, "Expected pod not to have a dsID label")
// behavior change: Daemon Set deletions do not delete their pods (for now)
labeled, err = waitForPodLabel(applicator, true, "node3/testPod")
Assert(t).IsNil(err, "Expected pod to have a dsID label")
}

func waitForPodLabel(applicator labels.Applicator, hasDSIDLabel bool, podPath string) (labels.Labeled, error) {