Skip to content

Commit

Permalink
Merge pull request #180 from slok/slok/refactor-windows-implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
slok committed Oct 9, 2021
2 parents efa09e0 + c41e2e3 commit 8e93f5e
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 54 deletions.
125 changes: 80 additions & 45 deletions internal/alert/alert.go
Expand Up @@ -62,42 +62,40 @@ type SLO struct {
}

func (g generator) GenerateMWMBAlerts(ctx context.Context, slo SLO) (*MWMBAlertGroup, error) {
if slo.TimeWindow != 30*24*time.Hour {
return nil, fmt.Errorf("only 30 day SLO time window is supported")
}
windowMeta := newDefaultWindowMetadata(slo.TimeWindow)

errorBudget := 100 - slo.Objective

group := MWMBAlertGroup{
PageQuick: MWMBAlert{
ID: fmt.Sprintf("%s-page-quick", slo.ID),
ShortWindow: windowPageQuickShort,
LongWindow: windowPageQuickLong,
BurnRateFactor: speedPageQuick,
ShortWindow: windowMeta.WindowPageQuickShort,
LongWindow: windowMeta.WindowPageQuickLong,
BurnRateFactor: windowMeta.GetSpeedPageQuick(),
ErrorBudget: errorBudget,
Severity: PageAlertSeverity,
},
PageSlow: MWMBAlert{
ID: fmt.Sprintf("%s-page-slow", slo.ID),
ShortWindow: windowPageSlowShort,
LongWindow: windowPageSlowLong,
BurnRateFactor: speedPageSlow,
ShortWindow: windowMeta.WindowPageSlowShort,
LongWindow: windowMeta.WindowPageSlowLong,
BurnRateFactor: windowMeta.GetSpeedPageSlow(),
ErrorBudget: errorBudget,
Severity: PageAlertSeverity,
},
TicketQuick: MWMBAlert{
ID: fmt.Sprintf("%s-ticket-quick", slo.ID),
ShortWindow: windowTicketQuickShort,
LongWindow: windowTicketQuickLong,
BurnRateFactor: speedTicketQuick,
ShortWindow: windowMeta.WindowTicketQuickShort,
LongWindow: windowMeta.WindowTicketQuickLong,
BurnRateFactor: windowMeta.GetSpeedTicketQuick(),
ErrorBudget: errorBudget,
Severity: TicketAlertSeverity,
},
TicketSlow: MWMBAlert{
ID: fmt.Sprintf("%s-ticket-slow", slo.ID),
ShortWindow: windowTicketSlowShort,
LongWindow: windowTicketSlowLong,
BurnRateFactor: speedTicketSlow,
ShortWindow: windowMeta.WindowTicketSlowShort,
LongWindow: windowMeta.WindowTicketSlowLong,
BurnRateFactor: windowMeta.GetSpeedTicketSlow(),
ErrorBudget: errorBudget,
Severity: TicketAlertSeverity,
},
Expand All @@ -106,40 +104,53 @@ func (g generator) GenerateMWMBAlerts(ctx context.Context, slo SLO) (*MWMBAlertG
return &group, nil
}

// Default multiwindow, multi-burn-rate alert parameters, taken from the recommended parameters table in:
// https://sre.google/workbook/alerting-on-slos/#recommended_parameters_for_an_slo_based_a
const (
// Time windows. Each alert (page/ticket, quick/slow) has a short and a long window.
windowPageQuickShort = 5 * time.Minute
windowPageQuickLong = 1 * time.Hour
windowPageSlowShort = 30 * time.Minute
windowPageSlowLong = 6 * time.Hour
windowTicketQuickShort = 2 * time.Hour
windowTicketQuickLong = 1 * 24 * time.Hour
windowTicketSlowShort = 6 * time.Hour
windowTicketSlowLong = 3 * 24 * time.Hour

// Error budget percents for a 30 day time window, one per alert.
ErrBudgetPercentPageQuick30D = 2
ErrBudgetPercentPageSlow30D = 5
ErrBudgetPercentTicketQuick30D = 10
ErrBudgetPercentTicketSlow30D = 10
)
// WindowMetadata holds the time windows and error budget percents required to
// calculate SLO alerts.
type WindowMetadata struct {
	// WindowPeriod is the full time window the error budget percents refer to.
	WindowPeriod time.Duration

	// Alerting required windows.
	// It's a matrix of values with:
	// - Alert severity: ["page", "ticket"].
	// - Alert speed: ["quick", "slow"].
	// - Measure period: ["short", "long"].

	// Page severity, quick alert.
	WindowPageQuickShort, WindowPageQuickLong time.Duration
	// Page severity, slow alert.
	WindowPageSlowShort, WindowPageSlowLong time.Duration
	// Ticket severity, quick alert.
	WindowTicketQuickShort, WindowTicketQuickLong time.Duration
	// Ticket severity, slow alert.
	WindowTicketSlowShort, WindowTicketSlowLong time.Duration

	// Error budget percent consumed for a full time window.
	// Google gives us some defaults in its SRE workbook that work correctly most of the time:
	// - Page quick: 2%
	// - Page slow: 5%
	// - Ticket quick: 10%
	// - Ticket slow: 10%
	ErrorBudgetPercPageQuick, ErrorBudgetPercPageSlow float64
	ErrorBudgetPercTicketQuick, ErrorBudgetPercTicketSlow float64
}

var (
// Error budget speeds based on a 30 day window, however once we have the factor (speed)
// the value can be used with any time window, that's why we calculate them here.
// We could hardcode the factors, but this way we know how they are generated and we use it
// as documentation.
baseWindow = 30 * 24 * time.Hour
speedPageQuick = getBurnRateFactor(baseWindow, ErrBudgetPercentPageQuick30D, windowPageQuickLong) // Speed: 14.4.
speedPageSlow = getBurnRateFactor(baseWindow, ErrBudgetPercentPageSlow30D, windowPageSlowLong) // Speed: 6.
speedTicketQuick = getBurnRateFactor(baseWindow, ErrBudgetPercentTicketQuick30D, windowTicketQuickLong) // Speed: 3.
speedTicketSlow = getBurnRateFactor(baseWindow, ErrBudgetPercentTicketSlow30D, windowTicketSlowLong) // Speed: 1.
)
// GetSpeedPageQuick returns the burn rate factor (speed) of the quick page alert.
//
// The speed is based on the full time window (WindowPeriod), the error budget
// percent of the quick page alert and its long window. Once we have the factor
// (speed), the value can be used with any time window.
func (w WindowMetadata) GetSpeedPageQuick() float64 {
	budgetPercent := w.ErrorBudgetPercPageQuick
	longWindow := w.WindowPageQuickLong

	return w.getBurnRateFactor(w.WindowPeriod, budgetPercent, longWindow)
}
// GetSpeedPageSlow returns the burn rate factor (speed) of the slow page alert,
// based on the full time window (WindowPeriod), the error budget percent of the
// slow page alert and its long window.
func (w WindowMetadata) GetSpeedPageSlow() float64 {
	budgetPercent := w.ErrorBudgetPercPageSlow
	longWindow := w.WindowPageSlowLong

	return w.getBurnRateFactor(w.WindowPeriod, budgetPercent, longWindow)
}
// GetSpeedTicketQuick returns the burn rate factor (speed) of the quick ticket alert,
// based on the full time window (WindowPeriod), the error budget percent of the
// quick ticket alert and its long window.
func (w WindowMetadata) GetSpeedTicketQuick() float64 {
	budgetPercent := w.ErrorBudgetPercTicketQuick
	longWindow := w.WindowTicketQuickLong

	return w.getBurnRateFactor(w.WindowPeriod, budgetPercent, longWindow)
}
// GetSpeedTicketSlow returns the burn rate factor (speed) of the slow ticket alert,
// based on the full time window (WindowPeriod), the error budget percent of the
// slow ticket alert and its long window.
func (w WindowMetadata) GetSpeedTicketSlow() float64 {
	budgetPercent := w.ErrorBudgetPercTicketSlow
	longWindow := w.WindowTicketSlowLong

	return w.getBurnRateFactor(w.WindowPeriod, budgetPercent, longWindow)
}

// getBurnRateFactor calculates the burn rate factor (speed) needed to consume the given percent of the
// error budget within a specific consumption window, taking into account the total time window.
func getBurnRateFactor(totalWindow time.Duration, errorBudgetPercent float64, consumptionWindow time.Duration) float64 {
func (w WindowMetadata) getBurnRateFactor(totalWindow time.Duration, errorBudgetPercent float64, consumptionWindow time.Duration) float64 {
// First get the total hours required to consume the % of the error budget in the total window.
hoursRequiredConsumption := errorBudgetPercent * totalWindow.Hours() / 100

Expand All @@ -149,3 +160,27 @@ func getBurnRateFactor(totalWindow time.Duration, errorBudgetPercent float64, co

return speed
}

// newDefaultWindowMetadata returns a common and safe to use window metadata for the
// received window period. Normally this works well with month based time windows like
// 28 day and 30 day, and it is the most common kind of SLO based window metadata.
//
// The values come from the recommended parameters table in
// https://sre.google/workbook/alerting-on-slos/#recommended_parameters_for_an_slo_based_a
func newDefaultWindowMetadata(windowPeriod time.Duration) WindowMetadata {
	const day = 24 * time.Hour

	var meta WindowMetadata
	meta.WindowPeriod = windowPeriod

	// Page alert windows.
	meta.WindowPageQuickShort = 5 * time.Minute
	meta.WindowPageQuickLong = time.Hour
	meta.WindowPageSlowShort = 30 * time.Minute
	meta.WindowPageSlowLong = 6 * time.Hour

	// Ticket alert windows.
	meta.WindowTicketQuickShort = 2 * time.Hour
	meta.WindowTicketQuickLong = day
	meta.WindowTicketSlowShort = 6 * time.Hour
	meta.WindowTicketSlowLong = 3 * day

	// Error budget percents consumed for a full time window.
	meta.ErrorBudgetPercPageQuick = 2
	meta.ErrorBudgetPercPageSlow = 5
	meta.ErrorBudgetPercTicketQuick = 10
	meta.ErrorBudgetPercTicketSlow = 10

	return meta
}
52 changes: 43 additions & 9 deletions internal/alert/alert_test.go
Expand Up @@ -16,15 +16,6 @@ func TestGenerateMWMBAlerts(t *testing.T) {
expAlerts *alert.MWMBAlertGroup
expErr bool
}{
"Generating alerts different to 30 day time window should fail.": {
slo: alert.SLO{
ID: "test",
TimeWindow: 31 * 24 * time.Hour,
Objective: 99.9,
},
expErr: true,
},

"Generating a 30 day time window alerts should generate the alerts correctly.": {
slo: alert.SLO{
ID: "test",
Expand Down Expand Up @@ -67,6 +58,49 @@ func TestGenerateMWMBAlerts(t *testing.T) {
},
},
},

"Generating a 28 day time window alerts should generate the alerts correctly.": {
slo: alert.SLO{
ID: "test",
TimeWindow: 28 * 24 * time.Hour,
Objective: 99.9,
},
expAlerts: &alert.MWMBAlertGroup{
PageQuick: alert.MWMBAlert{
ID: "test-page-quick",
ShortWindow: 5 * time.Minute,
LongWindow: 1 * time.Hour,
BurnRateFactor: 13.44,
ErrorBudget: 0.09999999999999432,
Severity: alert.PageAlertSeverity,
},
PageSlow: alert.MWMBAlert{
ID: "test-page-slow",
ShortWindow: 30 * time.Minute,
LongWindow: 6 * time.Hour,
BurnRateFactor: 5.6000000000000005,
ErrorBudget: 0.09999999999999432,
Severity: alert.PageAlertSeverity,
},

TicketQuick: alert.MWMBAlert{
ID: "test-ticket-quick",
ShortWindow: 2 * time.Hour,
LongWindow: 1 * 24 * time.Hour,
BurnRateFactor: 2.8000000000000003,
ErrorBudget: 0.09999999999999432,
Severity: alert.TicketAlertSeverity,
},
TicketSlow: alert.MWMBAlert{
ID: "test-ticket-slow",
ShortWindow: 6 * time.Hour,
LongWindow: 3 * 24 * time.Hour,
BurnRateFactor: 0.9333333333333333,
ErrorBudget: 0.09999999999999432,
Severity: alert.TicketAlertSeverity,
},
},
},
}

for name, test := range tests {
Expand Down

0 comments on commit 8e93f5e

Please sign in to comment.