Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added code to avoid aggressive update due to rolling update issue on any node. #162

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion cmd/calico-node/main.go
Expand Up @@ -17,6 +17,7 @@ import (
"flag"
"fmt"
"os"
"time"

confdConfig "github.com/kelseyhightower/confd/pkg/config"
confd "github.com/kelseyhightower/confd/pkg/run"
Expand Down Expand Up @@ -44,6 +45,9 @@ var birdReady = flagSet.Bool("bird-ready", false, "Run BIRD readiness checks")
var bird6Ready = flagSet.Bool("bird6-ready", false, "Run BIRD6 readiness checks")
var felixReady = flagSet.Bool("felix-ready", false, "Run felix readiness checks")

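// thresholdTime is the window, measured from the nodename file's modification time, during which
// the BIRD readiness check requires every BGP peer to be established. Defaults to 30 seconds.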
var thresholdTime = flagSet.Duration("threshold-time", 30*time.Second, "Threshold time for bird readiness")

// confd flags
var runConfd = flagSet.Bool("confd", false, "Run confd")
var confdRunOnce = flagSet.Bool("confd-run-once", false, "Run confd in oneshot mode")
Expand All @@ -63,6 +67,9 @@ func main() {
os.Exit(1)
}

// Print the configured threshold time for the BIRD readiness check.
fmt.Println("Threshold time for bird readiness check: ", *thresholdTime)

// Perform some validation on the parsed flags. Only one of the following may be
// specified at a time.
onlyOne := []*bool{version, runFelix, runStartup, runConfd}
Expand All @@ -80,7 +87,7 @@ func main() {

// If any of the readiness options are provided, check readiness.
if *birdReady || *bird6Ready || *felixReady {
readiness.Run(*birdReady, *bird6Ready, *felixReady)
readiness.Run(*birdReady, *bird6Ready, *felixReady, *thresholdTime)
os.Exit(0)
}

Expand Down
52 changes: 37 additions & 15 deletions pkg/readiness/readiness.go
Expand Up @@ -28,8 +28,7 @@ import (

const felixReadinessEp = "http://localhost:9099/readiness"

func Run(bird, bird6, felix bool) {

func Run(bird, bird6, felix bool, thresholdTime time.Duration) {
if !bird && !felix && !bird6 {
fmt.Printf("calico/node readiness check error: must specify at least one of -bird, -bird6, or -felix")
os.Exit(1)
Expand All @@ -43,14 +42,14 @@ func Run(bird, bird6, felix bool) {
}

if bird {
if err := checkBIRDReady("4"); err != nil {
if err := checkBIRDReady("4", thresholdTime); err != nil {
fmt.Printf("calico/node is not ready: BIRD is not ready: %+v", err)
os.Exit(1)
}
}

if bird6 {
if err := checkBIRDReady("6"); err != nil {
if err := checkBIRDReady("6", thresholdTime); err != nil {
fmt.Printf("calico/node is not ready: BIRD6 is not ready: %+v", err)
os.Exit(1)
}
Expand All @@ -60,7 +59,13 @@ func Run(bird, bird6, felix bool) {
// checkBIRDReady checks if BIRD is ready by connecting to the BIRD
// socket to gather all BGP peer connection status, and overall graceful
// restart status.
func checkBIRDReady(ipv string) error {
func checkBIRDReady(ipv string, thresholdTime time.Duration) error {
// Stat the nodename file to get its modification time.
nodenameFileStat, err := os.Stat("/var/lib/calico/nodename")
if err != nil {
return fmt.Errorf("Failed to stat() nodename file: %v", err)
}

// Check for unestablished peers
peers, err := bird.GetPeers(ipv)
log.Debugf("peers: %v", peers)
Expand All @@ -69,21 +74,38 @@ func checkBIRDReady(ipv string) error {
}

s := []string{}

// numEstablishedPeer counts the peers whose BGP state is "Established".
numEstablishedPeer := 0

for _, peer := range peers {
if peer.BGPState != "Established" {
if peer.BGPState == "Established" {
numEstablishedPeer += 1
} else {
s = append(s, peer.PeerIP)
}
}
if len(s) > 0 {
return fmt.Errorf("BGP not established with %+v", strings.Join(s, ","))
}
log.Infof("Number of node(s) with BGP peering established = %v", numEstablishedPeer)

// Check for GR
gr, err := bird.GRInProgress(ipv)
if err != nil {
return err
} else if gr {
return errors.New("graceful restart in progress")
if time.Since(nodenameFileStat.ModTime()) < thresholdTime {
if len(s) > 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think this needs a comment to explain why we're doing a different check here.

// When we first start up, only report ready if all our peerings are established.
// This prevents rolling update from proceeding until BGP is back up.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged and added.

// When we first start up, only report ready if all our peerings are established.
// This prevents rolling update from proceeding until BGP is back up.
return fmt.Errorf("BGP not established with %+v", strings.Join(s, ","))
}
// Check for GR
gr, err := bird.GRInProgress(ipv)
if err != nil {
return err
} else if gr {
return errors.New("graceful restart in progress")
}
} else if numEstablishedPeer > 0 {
// After a while, only require a single peering to be up. This prevents the whole mesh
// from reporting not-ready if some nodes go down.
log.Debugf("There exist(s) %v calico node(s) with BGP peering established.", numEstablishedPeer)
} else {
return fmt.Errorf("BGP not established with %+v", strings.Join(s, ","))
}

return nil
Expand Down