@@ -102,6 +102,10 @@ func (c *BackupSessionController) runBackupSessionController(backupConfiguration
102
102
}
103
103
c .bsQueue .Run (stopCh )
104
104
105
+ // controller has started successfully. send successful backup setup metrics
106
+ log .Infoln ("Started BackupSession controller........" )
107
+ c .handleBackupSetupSuccess (backupConfiguration )
108
+
105
109
// wait until stop signal is sent.
106
110
<- stopCh
107
111
return nil
@@ -149,44 +153,54 @@ func (c *BackupSessionController) processBackupSession(key string) error {
149
153
backupSession := obj .(* api_v1beta1.BackupSession )
150
154
glog .Infof ("Sync/Add/Update for Backup Session %s" , backupSession .GetName ())
151
155
152
- // get respective BackupConfiguration for BackupSession
153
- backupConfiguration , err := c .StashClient .StashV1beta1 ().BackupConfigurations (backupSession .Namespace ).Get (
154
- backupSession .Spec .BackupConfiguration .Name ,
155
- metav1.GetOptions {},
156
- )
156
+ err := c .startBackupProcess (backupSession )
157
157
if err != nil {
158
- return fmt .Errorf ("can't get BackupConfiguration for BackupSession %s/%s, reason: %s" , backupSession .Namespace , backupSession .Name , err )
158
+ e2 := c .handleBackupFailure (backupSession , err )
159
+ err = errors .NewAggregate ([]error {err , e2 })
160
+ // log failure. don't fail the container as it may interrupt user's service
161
+ log .Infoln ("failed to complete backup. Reason: " , err .Error ())
159
162
}
163
+ }
164
+ return nil
165
+ }
160
166
161
- // skip if BackupConfiguration paused
162
- if backupConfiguration .Spec .Paused {
163
- log .Infof ("Skipping processing BackupSession %s/%s. Reason: Backup Configuration is paused." , backupSession .Namespace , backupSession .Name )
164
- return nil
165
- }
167
+ func (c * BackupSessionController ) startBackupProcess (backupSession * api_v1beta1.BackupSession ) error {
168
+ // get respective BackupConfiguration for BackupSession
169
+ backupConfiguration , err := c .StashClient .StashV1beta1 ().BackupConfigurations (backupSession .Namespace ).Get (
170
+ backupSession .Spec .BackupConfiguration .Name ,
171
+ metav1.GetOptions {},
172
+ )
173
+ if err != nil {
174
+ return fmt .Errorf ("can't get BackupConfiguration for BackupSession %s/%s, reason: %s" , backupSession .Namespace , backupSession .Name , err )
175
+ }
166
176
167
- host , err := util .GetHostName (backupConfiguration .Spec .Target )
168
- if err != nil {
169
- return err
170
- }
177
+ // skip if BackupConfiguration paused
178
+ if backupConfiguration .Spec .Paused {
179
+ log .Infof ("Skipping processing BackupSession %s/%s. Reason: Backup Configuration is paused." , backupSession .Namespace , backupSession .Name )
180
+ return nil
181
+ }
171
182
172
- // if BackupSession already has been processed for this host then skip further processing
173
- if c .isBackupTakenForThisHost (backupSession , host ) {
174
- log .Infof ("Skip processing BackupSession %s/%s. Reason: BackupSession has been processed already for host %q\n " , backupSession .Namespace , backupSession .Name , host )
175
- return nil
176
- }
183
+ host , err := util .GetHostName (backupConfiguration .Spec .Target )
184
+ if err != nil {
185
+ return err
186
+ }
177
187
178
- // For Deployment, ReplicaSet and ReplicationController only leader pod is running this controller so no problem with restic repo lock.
179
- // For StatefulSet and DaemonSet all pods are running this controller and all will try to backup simultaneously. But, restic repository can be
180
- // locked by only one pod. So, we need a leader election to determine who will take backup first. Once backup is complete, the leader pod will
181
- // step down from leadership so that another replica can acquire leadership and start taking backup.
182
- switch backupConfiguration .Spec .Target .Ref .Kind {
183
- case apis .KindDeployment , apis .KindReplicaSet , apis .KindReplicationController , apis .KindDeploymentConfig :
184
- return c .backup (backupSession , backupConfiguration )
185
- default :
186
- return c .electBackupLeader (backupSession , backupConfiguration )
187
- }
188
+ // if BackupSession already has been processed for this host then skip further processing
189
+ if c .isBackupTakenForThisHost (backupSession , host ) {
190
+ log .Infof ("Skip processing BackupSession %s/%s. Reason: BackupSession has been processed already for host %q\n " , backupSession .Namespace , backupSession .Name , host )
191
+ return nil
192
+ }
193
+
194
+ // For Deployment, ReplicaSet and ReplicationController only leader pod is running this controller so no problem with restic repo lock.
195
+ // For StatefulSet and DaemonSet all pods are running this controller and all will try to backup simultaneously. But, restic repository can be
196
+ // locked by only one pod. So, we need a leader election to determine who will take backup first. Once backup is complete, the leader pod will
197
+ // step down from leadership so that another replica can acquire leadership and start taking backup.
198
+ switch backupConfiguration .Spec .Target .Ref .Kind {
199
+ case apis .KindDeployment , apis .KindReplicaSet , apis .KindReplicationController , apis .KindDeploymentConfig :
200
+ return c .backup (backupSession , backupConfiguration )
201
+ default :
202
+ return c .electBackupLeader (backupSession , backupConfiguration )
188
203
}
189
- return nil
190
204
}
191
205
192
206
func (c * BackupSessionController ) backup (backupSession * api_v1beta1.BackupSession , backupConfiguration * api_v1beta1.BackupConfiguration ) error {
@@ -295,18 +309,14 @@ func (c *BackupSessionController) electLeaderPod(backupConfiguration *api_v1beta
295
309
RetryPeriod : 2 * time .Second ,
296
310
Callbacks : leaderelection.LeaderCallbacks {
297
311
OnStartedLeading : func (ctx context.Context ) {
298
- log .Infoln ("Got leadership, preparing starting BackupSession controller" )
312
+ log .Infoln ("Got leadership, starting BackupSession controller" )
299
313
// this pod is now leader. run BackupSession controller.
300
314
err := c .runBackupSessionController (backupConfiguration , stopCh )
301
315
if err != nil {
302
- e2 := c .HandleBackupFailure (err )
303
- if e2 != nil {
304
- err = errors .NewAggregate ([]error {err , e2 })
305
- }
316
+ // send failure metric and fail the container so that it retries the setup
317
+ c .HandleBackupSetupFailure (err )
306
318
// step down from leadership so that other replicas can try to start BackupSession controller
307
319
cancel ()
308
- // fail the container so that it restart and re-try this process.
309
- log .Fatalln ("failed to start BackupSession controller. Reason: " , err .Error ())
310
320
}
311
321
},
312
322
OnStoppedLeading : func () {
@@ -354,14 +364,12 @@ func (c *BackupSessionController) electBackupLeader(backupSession *api_v1beta1.B
354
364
// run backup process
355
365
err := c .backup (backupSession , backupConfiguration )
356
366
if err != nil {
357
- e2 := c .HandleBackupFailure (err )
358
- if e2 != nil {
359
- err = errors .NewAggregate ([]error {err , e2 })
360
- }
367
+ // send failure metrics and update BackupSession status
368
+ err = c .handleBackupFailure (backupSession , err )
361
369
// step down from leadership so that other replicas can start backup
362
370
cancel ()
363
- // fail the container so that it restart and re-try to backup
364
- log .Fatalln ("failed to complete backup. Reason: " , err .Error ())
371
+ // log failure. don't fail the container as it may interrupt user's service
372
+ log .Warningln ("failed to complete backup. Reason: " , err .Error ())
365
373
}
366
374
// backup process is complete. now, step down from leadership so that other replicas can start
367
375
cancel ()
@@ -374,12 +382,7 @@ func (c *BackupSessionController) electBackupLeader(backupSession *api_v1beta1.B
374
382
return nil
375
383
}
376
384
377
- func (c * BackupSessionController ) HandleBackupFailure (backupErr error ) error {
378
- backupSession , err := c .StashClient .StashV1beta1 ().BackupSessions (c .Namespace ).Get (c .BackupConfigurationName , metav1.GetOptions {})
379
- if err != nil {
380
- return err
381
- }
382
-
385
+ func (c * BackupSessionController ) handleBackupFailure (backupSession * api_v1beta1.BackupSession , backupErr error ) error {
383
386
backupConfiguration , err := c .StashClient .StashV1beta1 ().BackupConfigurations (backupSession .Namespace ).Get (backupSession .Spec .BackupConfiguration .Name , metav1.GetOptions {})
384
387
if err != nil {
385
388
return err
@@ -412,6 +415,41 @@ func (c *BackupSessionController) HandleBackupFailure(backupErr error) error {
412
415
return nil
413
416
}
414
417
418
+ func (c * BackupSessionController ) HandleBackupSetupFailure (setupErr error ) {
419
+ backupConfiguration , err := c .StashClient .StashV1beta1 ().BackupConfigurations (c .Namespace ).Get (c .BackupConfigurationName , metav1.GetOptions {})
420
+ if err != nil {
421
+ e2 := errors .NewAggregate ([]error {setupErr , err })
422
+ log .Fatalln ("failed to setup backup process. Reason: " , e2 .Error ())
423
+ }
424
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("BackupConfiguration=%s" , backupConfiguration .Name ))
425
+ if backupConfiguration .Spec .Target != nil {
426
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("kind=%s" , backupConfiguration .Spec .Target .Ref .Kind ))
427
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("name=%s" , backupConfiguration .Spec .Target .Ref .Name ))
428
+ }
429
+ // send prometheus metrics
430
+ if c .Metrics .Enabled {
431
+ err := restic .HandleBackupSetupMetrics (c .Metrics , setupErr )
432
+ setupErr = errors .NewAggregate ([]error {setupErr , err })
433
+ }
434
+ // fail the container so that it restart and re-try this process.
435
+ log .Fatalln ("failed to setup backup process. Reason: " , setupErr .Error ())
436
+ }
437
+
438
+ func (c * BackupSessionController ) handleBackupSetupSuccess (backupConfiguration * api_v1beta1.BackupConfiguration ) {
439
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("BackupConfiguration=%s" , backupConfiguration .Name ))
440
+ if backupConfiguration .Spec .Target != nil {
441
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("kind=%s" , backupConfiguration .Spec .Target .Ref .Kind ))
442
+ c .Metrics .Labels = append (c .Metrics .Labels , fmt .Sprintf ("name=%s" , backupConfiguration .Spec .Target .Ref .Name ))
443
+ }
444
+ // send prometheus metrics
445
+ if c .Metrics .Enabled {
446
+ err := restic .HandleBackupSetupMetrics (c .Metrics , nil )
447
+ if err != nil {
448
+ log .Warningln ("failed to send prometheus metrics. Reason: " , err .Error ())
449
+ }
450
+ }
451
+ }
452
+
415
453
func (c * BackupSessionController ) writeBackupFailureEvent (backupSession * api_v1beta1.BackupSession , host string , err error ) {
416
454
// write failure event
417
455
ref , rerr := reference .GetReference (stash_scheme .Scheme , backupSession )
0 commit comments