Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,29 @@ jobs:
- run: make assets
- run: make apiv2
- run: git diff --exit-code
- run: make assets-tarball
- uses: prometheus/promci-artifacts/save@f9a587dbc0b2c78a0c54f8ad1cde71ea29a4b76f # v0.1.0
with:
directory: .tarballs
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v4.6.2
with:
name: ui-dist
path: ui/app/dist
include-hidden-files: true

build:
name: Build Alertmanager for common architectures
needs: [test_frontend]
runs-on: ubuntu-latest
strategy:
matrix:
thread: [0, 1, 2]
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4.0.0
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ui-dist
path: ui/app/dist
- uses: prometheus/promci/build@42c3c84c865e5c1ab78543b929f7341eb2ef6123 # v0.6.1
with:
promu_opts: "-p linux/amd64 -p windows/amd64 -p linux/arm64 -p darwin/amd64 -p darwin/arm64 -p linux/386"
Expand All @@ -40,6 +54,7 @@ jobs:
test:
name: Test
runs-on: ubuntu-latest
needs: [test_frontend]
# Whenever the Go version is updated here, .promu.yml
# should also be updated.
container:
Expand All @@ -58,14 +73,10 @@ jobs:
EMAIL_AUTH_CONFIG: testdata/auth.yml
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- id: cache-paths
run: echo "npm-cache=$(npm config get cache)" >> $GITHUB_OUTPUT
- uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
path: ${{ steps.cache-paths.outputs.npm-cache }}
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-
name: ui-dist
path: ui/app/dist
- uses: prometheus/promci-setup@5af30ba8c199a91d6c04ebdc3c48e630e355f62d # v0.1.0
- run: make
- run: git diff --exit-code
4 changes: 4 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ jobs:
needs: ci
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ui-dist
path: ui/app/dist
- uses: prometheus/promci/build@42c3c84c865e5c1ab78543b929f7341eb2ef6123 # v0.6.1
with:
parallelism: 12
Expand Down
2 changes: 1 addition & 1 deletion .promu.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
go:
# Whenever the Go version is updated here,
# .circle/config.yml should also be updated.
# .github/workflows/ci.yml and ui/app/Makefile should also be updated.
version: 1.26
repository:
path: github.com/prometheus/alertmanager
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
* [FEATURE] ...
* [ENHANCEMENT] ...

## 0.32.1-rc.0 / 2026-04-27

* [BUGFIX] dispatcher: Fix issue with dispatching to a contended route. #5179
* [BUGFIX] ui: Provide prebuilt ui assets in release. #5191

## 0.32.0 / 2026-04-10

* [CHANGE] `go get github.com/prometheus/alertmanager/ui` will now fail as compiled UI assets are no longer checked into the repository. Downstream builds that rely on these assets being present in the source tree must now build the UI from source. #5113
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ lint: ui-elm common-lint
.PHONY: assets
assets: $(FRONTEND_DIR)/src/Data ui-elm template/email.tmpl

.PHONY: assets-tarball
assets-tarball: ui-elm
mkdir -p .tarballs
tar czf ".tarballs/alertmanager-web-ui-$(file <VERSION).tar.gz" -C ui/app dist

.PHONY: ui-elm
ui-elm:
cd $(FRONTEND_DIR) && $(MAKE) build
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.32.0
0.32.1-rc.0
23 changes: 20 additions & 3 deletions dispatch/dispatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ var tracer = tracing.NewTracer("github.com/prometheus/alertmanager/dispatch")

// DispatcherMetrics represents metrics associated to a dispatcher.
type DispatcherMetrics struct {
aggrGroups prometheus.Gauge
processingDuration prometheus.Summary
aggrGroupLimitReached prometheus.Counter
aggrGroups prometheus.Gauge
processingDuration prometheus.Summary
aggrGroupLimitReached prometheus.Counter
aggrGroupCreationRetries prometheus.Counter
aggrGroupCreationGivenUp prometheus.Counter
}

// NewDispatcherMetrics returns a new registered DispatchMetrics.
Expand All @@ -81,6 +83,18 @@ func NewDispatcherMetrics(registerLimitMetrics bool, r prometheus.Registerer) *D
Help: "Number of times when dispatcher failed to create new aggregation group due to limit.",
},
),
aggrGroupCreationRetries: promauto.With(r).NewCounter(
prometheus.CounterOpts{
Name: "alertmanager_dispatcher_aggregation_group_creation_retries_total",
Help: "Number of CAS retries while creating aggregation groups under contention.",
},
),
aggrGroupCreationGivenUp: promauto.With(r).NewCounter(
prometheus.CounterOpts{
Name: "alertmanager_dispatcher_aggregation_group_creation_given_up_total",
Help: "Number of alerts dropped because aggregation group creation exceeded the retry limit.",
},
),
}

return &m
Expand Down Expand Up @@ -505,6 +519,7 @@ func (d *Dispatcher) groupAlert(ctx context.Context, alert *types.Alert, route *
// We swapped the new group in, we can break and start it.
break
}
loaded = false
} else {
el, loaded = d.routeGroupsSlice[route.Idx].groups.LoadOrStore(fp, ag)
if !loaded {
Expand All @@ -526,8 +541,10 @@ func (d *Dispatcher) groupAlert(ctx context.Context, alert *types.Alert, route *

// If we failed to swap, it means another goroutine has created/modified the group
retries++
d.metrics.aggrGroupCreationRetries.Inc()
if retries > 100 {
// This shouldn't happen - indicates a bug or extreme contention
d.metrics.aggrGroupCreationGivenUp.Inc()
d.logger.Error("excessive retries creating aggregation group",
"fingerprint", fp,
"route", route.Key(),
Expand Down
99 changes: 99 additions & 0 deletions dispatch/dispatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"fmt"
"log/slog"
"reflect"
"runtime"
"sort"
"sync"
"testing"
Expand Down Expand Up @@ -800,6 +801,104 @@ func TestDispatcher_DoMaintenance(t *testing.T) {
require.Empty(t, mutedBy)
}

// TestGroupAlert_RecoversWhenCASFails reproduces a bug where groupAlert would
// retry the same CompareAndSwap with a stale `el` reference after the slot at
// fp had been mutated by a competing goroutine (or removed by maintenance).
// Without the fix, CAS losers spin until the 100-retry give-up and lose their
// alert. With the fix, they fall back to LoadOrStore and insert into whichever
// live group now occupies the slot.
//
// Each round pre-stores a destroyed aggrGroup at fp and fires many concurrent
// groupAlert calls. We keep running rounds until we observe the contended CAS
// branch firing at least once (cap at maxRounds to avoid hangs on a pathologic
// scheduler) — a single round can be unlucky and serialize, taking the early
// Load+insert path on every goroutine.
func TestGroupAlert_RecoversWhenCASFails(t *testing.T) {
const (
alertsPerRound = 200
maxRounds = 50
)

logger := promslog.NewNopLogger()
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, reg, nil)
require.NoError(t, err)
defer alerts.Close()

route := &Route{
RouteOpts: RouteOpts{
Receiver: "test",
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: time.Hour, // never flush during this test
GroupInterval: time.Hour,
RepeatInterval: time.Hour,
},
Idx: 0,
}
timeout := func(d time.Duration) time.Duration { return d }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
metrics := NewDispatcherMetrics(false, reg)
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, metrics)
// Don't call Run — put the dispatcher manually in the DispatcherStateWaitingToStart
// state so groupAlert's final switch falls through the default branch and the
// aggregation group's run goroutine is never started.
dispatcher.routeGroupsSlice = []routeAggrGroups{{route: route}}
dispatcher.state.Store(DispatcherStateWaitingToStart) // silences the warn that would happen in unknown mode.
rounds := 0
for rounds < maxRounds && testutil.ToFloat64(metrics.aggrGroupCreationRetries) == 0 {
groupLabels := model.LabelSet{"alertname": model.LabelValue(fmt.Sprintf("shared-%d", rounds))}
destroyedAg := newAggrGroup(context.Background(), groupLabels, route, timeout, marker, logger)
// Mark the store destroyed: empty store + destroyIfEmpty=true.
require.NoError(t, destroyedAg.alerts.DeleteIfNotModified(types.AlertSlice{}, true))
require.True(t, destroyedAg.destroyed())
fp := destroyedAg.fingerprint()
dispatcher.routeGroupsSlice[0].groups.Store(fp, destroyedAg)

var ready, done sync.WaitGroup
ready.Add(alertsPerRound)
done.Add(alertsPerRound)
start := make(chan struct{})
for i := range alertsPerRound {
go func() {
defer done.Done()
alert := newAlert(model.LabelSet{
"alertname": groupLabels["alertname"],
"instance": model.LabelValue(fmt.Sprintf("inst-%d", i)),
})
ready.Done()
<-start
dispatcher.groupAlert(context.Background(), alert, route)
}()
}
ready.Wait()
close(start)
done.Wait()

el, ok := dispatcher.routeGroupsSlice[0].groups.Load(fp)
require.True(t, ok, "round %d: a live group must occupy the fp after the race", rounds)
finalAg := el.(*aggrGroup)
require.False(t, finalAg.destroyed(), "round %d: the final group must not be destroyed", rounds)
require.NotSame(t, destroyedAg, finalAg, "round %d: destroyed group must have been replaced", rounds)
require.Len(t, finalAg.alerts.List(), alertsPerRound, "round %d: all alerts must land in the final group", rounds)
rounds++
}

// Give-ups must stay 0: losers must recover via LoadOrStore, not spin to
// the 100-retry limit.
require.Zero(t, testutil.ToFloat64(metrics.aggrGroupCreationGivenUp), "no alert should be dropped to the give-up branch")

// With GOMAXPROCS=1 the contention can't be fully exercised.
// Skip the check that retries > 0.
if runtime.GOMAXPROCS(0) == 1 {
return
}

// Retries > 0 proves the test actually exercised the contended CAS-recovery
// branch (rather than every goroutine taking the early Load+insert path).
require.Positive(t, testutil.ToFloat64(metrics.aggrGroupCreationRetries), "contended CAS path was not exercised in %d rounds — scheduler is unusually serial", rounds)
}

func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) {
t.Run("successful flush deletes markers for resolved alerts", func(t *testing.T) {
ctx := context.Background()
Expand Down
98 changes: 73 additions & 25 deletions ui/app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,105 @@
# elm files change during execution.
ELM_FILES = $(shell find src -iname *.elm)
# If JUNIT_DIR is set, the tests are executed with the JUnit reporter and the result is stored in the given directory.
JUNIT_DIR ?=
DOCKER ?= docker
OPENAPI_GEN_IMAGE ?= openapitools/openapi-generator-cli:v3.3.4
JUNIT_DIR ?=
DOCKER ?=
OPENAPI_GEN_IMAGE ?= docker.io/openapitools/openapi-generator-cli:v3.3.4
UI_BUILD_IMAGE ?= quay.io/prometheus/golang-builder:1.26-base # Derived from ../../.promu.yml.
CA_BUNDLE ?=

ifeq ($(DOCKER),)
# Run locally.
NPM = npm
NPX = npx
CONTAINER_RUNTIME = docker # Runtime is needed, since src/Data requires the OPENAPI_GEN_IMAGE.
USER_ARGS = --user $(shell id -u):$(shell id -g)
else
# Run npm and npx inside container runtime.
# `USER_ARGS` prevents creation of root-owned files.
ifeq ($(DOCKER),podman)
USER_ARGS = --userns=keep-id --user $(shell id -u):$(shell id -g)
else
USER_ARGS = --user $(shell id -u):$(shell id -g)
endif
CA_BUNDLE_ARGS = $(if $(CA_BUNDLE),--volume $(CA_BUNDLE):/etc/ssl/certs/ca-certificates.crt:ro \
--env NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-certificates.crt)
DOCKER_RUN = $(DOCKER) run --rm -i \
--entrypoint "" \
-v "$(CURDIR):/app" \
-w /app \
$(USER_ARGS) \
--env HOME=/tmp \
$(CA_BUNDLE_ARGS) \
$(UI_BUILD_IMAGE)
NPM = $(DOCKER_RUN) npm
NPX = $(DOCKER_RUN) npx
CONTAINER_RUNTIME = $(DOCKER)
endif

.PHONY: all
all: src/Data build test

node_modules: package-lock.json
npm ci
@echo ">> installing dependencies"
$(NPM) ci

.PHONY: format
format: node_modules $(ELM_FILES)
@echo ">> format front-end code"
@npx elm-format --yes $(ELM_FILES)
@$(NPX) elm-format --yes $(ELM_FILES)

.PHONY: review
review: node_modules
@npx elm-review --fix
@$(NPX) elm-review --fix

.PHONY: test
test: node_modules
rm -rf elm-stuff/generated-code
@npx elm-format $(ELM_FILES) --validate
@npx elm-review
@$(NPX) elm-format $(ELM_FILES) --validate
@$(NPX) elm-review
ifneq ($(JUNIT_DIR),)
mkdir -p $(JUNIT_DIR)
@npx elm-test --report=junit | tee $(JUNIT_DIR)/junit.xml
@$(NPX) elm-test --report=junit | tee $(JUNIT_DIR)/junit.xml
else
@npx elm-test
@$(NPX) elm-test
endif

dev-server:
.PHONY: dev-server
dev-server: node_modules
ifeq ($(DOCKER),)
npx elm reactor
else
$(DOCKER) run --rm -it \
--entrypoint "" \
-v "$(CURDIR):/app" \
-w /app \
-p 8000:8000 \
$(USER_ARGS) \
$(CA_BUNDLE_ARGS) \
$(UI_BUILD_IMAGE) \
npx elm reactor
endif

.PHONY: build
build: .build_stamp
build: dist/.build_stamp

.build_stamp: $(ELM_FILES) index.html vite.config.mjs node_modules
dist/.build_stamp: $(ELM_FILES) index.html vite.config.mjs package-lock.json
@echo ">> building frontend"
npm run build
@touch .build_stamp
$(MAKE) node_modules
$(NPM) run build
@touch dist/.build_stamp

src/Data: ../../api/v2/openapi.yaml
-rm -r src/Data
set -e; \
$(DOCKER) run \
--name openapi-gen \
-rm -rf src/Data src/DateTime.elm
$(CONTAINER_RUNTIME) run --rm \
$(USER_ARGS) \
--entrypoint sh \
-v ${PWD}/../..:/local \
-v ${PWD}/src:/output/src \
$(OPENAPI_GEN_IMAGE) \
generate -i /local/api/v2/openapi.yaml -g elm -o /tmp/openapi-gen; \
trap "$(DOCKER) rm openapi-gen" EXIT; \
$(DOCKER) cp openapi-gen:/tmp/openapi-gen/src/Data src/Data; \
$(DOCKER) cp openapi-gen:/tmp/openapi-gen/src/DateTime.elm src/DateTime.elm
make format
-c 'java -jar /opt/openapi-generator/modules/openapi-generator-cli/target/openapi-generator-cli.jar generate -i /local/api/v2/openapi.yaml -g elm -o /tmp/openapi-gen && cp -r /tmp/openapi-gen/src/Data /output/src/Data && cp /tmp/openapi-gen/src/DateTime.elm /output/src/DateTime.elm'
$(MAKE) format

.PHONY: clean
clean:
- @rm -rf .build_stamp dist elm-stuff src/Data src/DateTime.elm openapi-* elm-*
- @rm -rf dist node_modules elm-stuff src/Data src/DateTime.elm openapi-* elm-*
Loading