diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52c96d041a..8636e79061 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,15 +22,29 @@ jobs: - run: make assets - run: make apiv2 - run: git diff --exit-code + - run: make assets-tarball + - uses: prometheus/promci-artifacts/save@f9a587dbc0b2c78a0c54f8ad1cde71ea29a4b76f # v0.1.0 + with: + directory: .tarballs + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v4.6.2 + with: + name: ui-dist + path: ui/app/dist + include-hidden-files: true build: name: Build Alertmanager for common architectures + needs: [test_frontend] runs-on: ubuntu-latest strategy: matrix: thread: [0, 1, 2] steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4.0.0 + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: ui-dist + path: ui/app/dist - uses: prometheus/promci/build@42c3c84c865e5c1ab78543b929f7341eb2ef6123 # v0.6.1 with: promu_opts: "-p linux/amd64 -p windows/amd64 -p linux/arm64 -p darwin/amd64 -p darwin/arm64 -p linux/386" @@ -40,6 +54,7 @@ jobs: test: name: Test runs-on: ubuntu-latest + needs: [test_frontend] # Whenever the Go version is updated here, .promu.yml # should also be updated. container: @@ -58,14 +73,10 @@ jobs: EMAIL_AUTH_CONFIG: testdata/auth.yml steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - id: cache-paths - run: echo "npm-cache=$(npm config get cache)" >> $GITHUB_OUTPUT - - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - path: ${{ steps.cache-paths.outputs.npm-cache }} - key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node- + name: ui-dist + path: ui/app/dist - uses: prometheus/promci-setup@5af30ba8c199a91d6c04ebdc3c48e630e355f62d # v0.1.0 - run: make - run: git diff --exit-code diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 113b7916ea..c3b5e57d55 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,6 +18,10 @@ jobs: needs: ci steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: ui-dist + path: ui/app/dist - uses: prometheus/promci/build@42c3c84c865e5c1ab78543b929f7341eb2ef6123 # v0.6.1 with: parallelism: 12 diff --git a/.promu.yml b/.promu.yml index f54b3eaa8b..8d1f6ae64e 100644 --- a/.promu.yml +++ b/.promu.yml @@ -1,6 +1,6 @@ go: # Whenever the Go version is updated here, - # .circle/config.yml should also be updated. + # .github/workflows/ci.yml and ui/app/Makefile should also be updated. version: 1.26 repository: path: github.com/prometheus/alertmanager diff --git a/CHANGELOG.md b/CHANGELOG.md index 3af6e90b5b..6b62cc61c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ * [FEATURE] ... * [ENHANCEMENT] ... +## 0.32.1-rc.0 / 2026-04-27 + +* [BUGFIX] dispatcher: Fix issue with dispatching to a contended route. #5179 +* [BUGFIX] ui: Provide prebuilt ui assets in release. #5191 + ## 0.32.0 / 2026-04-10 * [CHANGE] `go get github.com/prometheus/alertmanager/ui` will now fail as compiled UI assets are no longer checked into the repository. Downstream builds that rely on these assets being present in the source tree must now build the UI from source. #5113 diff --git a/Makefile b/Makefile index 18aca0ecfb..11554de9e3 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,11 @@ lint: ui-elm common-lint .PHONY: assets assets: $(FRONTEND_DIR)/src/Data ui-elm template/email.tmpl +.PHONY: assets-tarball +assets-tarball: ui-elm + mkdir -p .tarballs + tar czf ".tarballs/alertmanager-web-ui-$(file 100 { // This shouldn't happen - indicates a bug or extreme contention + d.metrics.aggrGroupCreationGivenUp.Inc() d.logger.Error("excessive retries creating aggregation group", "fingerprint", fp, "route", route.Key(), diff --git a/dispatch/dispatch_test.go b/dispatch/dispatch_test.go index d19a584f2f..6ecfa5991a 100644 --- a/dispatch/dispatch_test.go +++ b/dispatch/dispatch_test.go @@ -18,6 +18,7 @@ import ( "fmt" "log/slog" "reflect" + "runtime" "sort" "sync" "testing" @@ -800,6 +801,104 @@ func TestDispatcher_DoMaintenance(t *testing.T) { require.Empty(t, mutedBy) } +// TestGroupAlert_RecoversWhenCASFails reproduces a bug where groupAlert would +// retry the same CompareAndSwap with a stale `el` reference after the slot at +// fp had been mutated by a competing goroutine (or removed by maintenance). +// Without the fix, CAS losers spin until the 100-retry give-up and lose their +// alert. With the fix, they fall back to LoadOrStore and insert into whichever +// live group now occupies the slot. +// +// Each round pre-stores a destroyed aggrGroup at fp and fires many concurrent +// groupAlert calls. We keep running rounds until we observe the contended CAS +// branch firing at least once (cap at maxRounds to avoid hangs on a pathologic +// scheduler) — a single round can be unlucky and serialize, taking the early +// Load+insert path on every goroutine. +func TestGroupAlert_RecoversWhenCASFails(t *testing.T) { + const ( + alertsPerRound = 200 + maxRounds = 50 + ) + + logger := promslog.NewNopLogger() + reg := prometheus.NewRegistry() + marker := types.NewMarker(reg) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, reg, nil) + require.NoError(t, err) + defer alerts.Close() + + route := &Route{ + RouteOpts: RouteOpts{ + Receiver: "test", + GroupBy: map[model.LabelName]struct{}{"alertname": {}}, + GroupWait: time.Hour, // never flush during this test + GroupInterval: time.Hour, + RepeatInterval: time.Hour, + }, + Idx: 0, + } + timeout := func(d time.Duration) time.Duration { return d } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + metrics := NewDispatcherMetrics(false, reg) + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, metrics) + // Don't call Run — put the dispatcher manually in the DispatcherStateWaitingToStart + // state so groupAlert's final switch falls through the default branch and the + // aggregation group's run goroutine is never started. + dispatcher.routeGroupsSlice = []routeAggrGroups{{route: route}} + dispatcher.state.Store(DispatcherStateWaitingToStart) // silences the warn that would happen in unknown mode. + rounds := 0 + for rounds < maxRounds && testutil.ToFloat64(metrics.aggrGroupCreationRetries) == 0 { + groupLabels := model.LabelSet{"alertname": model.LabelValue(fmt.Sprintf("shared-%d", rounds))} + destroyedAg := newAggrGroup(context.Background(), groupLabels, route, timeout, marker, logger) + // Mark the store destroyed: empty store + destroyIfEmpty=true. + require.NoError(t, destroyedAg.alerts.DeleteIfNotModified(types.AlertSlice{}, true)) + require.True(t, destroyedAg.destroyed()) + fp := destroyedAg.fingerprint() + dispatcher.routeGroupsSlice[0].groups.Store(fp, destroyedAg) + + var ready, done sync.WaitGroup + ready.Add(alertsPerRound) + done.Add(alertsPerRound) + start := make(chan struct{}) + for i := range alertsPerRound { + go func() { + defer done.Done() + alert := newAlert(model.LabelSet{ + "alertname": groupLabels["alertname"], + "instance": model.LabelValue(fmt.Sprintf("inst-%d", i)), + }) + ready.Done() + <-start + dispatcher.groupAlert(context.Background(), alert, route) + }() + } + ready.Wait() + close(start) + done.Wait() + + el, ok := dispatcher.routeGroupsSlice[0].groups.Load(fp) + require.True(t, ok, "round %d: a live group must occupy the fp after the race", rounds) + finalAg := el.(*aggrGroup) + require.False(t, finalAg.destroyed(), "round %d: the final group must not be destroyed", rounds) + require.NotSame(t, destroyedAg, finalAg, "round %d: destroyed group must have been replaced", rounds) + require.Len(t, finalAg.alerts.List(), alertsPerRound, "round %d: all alerts must land in the final group", rounds) + rounds++ + } + + // Give-ups must stay 0: losers must recover via LoadOrStore, not spin to + // the 100-retry limit. + require.Zero(t, testutil.ToFloat64(metrics.aggrGroupCreationGivenUp), "no alert should be dropped to the give-up branch") + + // With GOMAXPROCS=1 the contention can't be fully exercised. + // Skip the check that retries > 0. + if runtime.GOMAXPROCS(0) == 1 { + return + } + + // Retries > 0 proves the test actually exercised the contended CAS-recovery + // branch (rather than every goroutine taking the early Load+insert path). + require.Positive(t, testutil.ToFloat64(metrics.aggrGroupCreationRetries), "contended CAS path was not exercised in %d rounds — scheduler is unusually serial", rounds) +} + func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) { t.Run("successful flush deletes markers for resolved alerts", func(t *testing.T) { ctx := context.Background() diff --git a/ui/app/Makefile b/ui/app/Makefile index 00e44c9152..ed925497d0 100644 --- a/ui/app/Makefile +++ b/ui/app/Makefile @@ -2,57 +2,105 @@ # elm files change during execution. ELM_FILES = $(shell find src -iname *.elm) # If JUNIT_DIR is set, the tests are executed with the JUnit reporter and the result is stored in the given directory. -JUNIT_DIR ?= -DOCKER ?= docker -OPENAPI_GEN_IMAGE ?= openapitools/openapi-generator-cli:v3.3.4 +JUNIT_DIR ?= +DOCKER ?= +OPENAPI_GEN_IMAGE ?= docker.io/openapitools/openapi-generator-cli:v3.3.4 +UI_BUILD_IMAGE ?= quay.io/prometheus/golang-builder:1.26-base # Derived from ../../.promu.yml. +CA_BUNDLE ?= +ifeq ($(DOCKER),) + # Run locally. + NPM = npm + NPX = npx + CONTAINER_RUNTIME = docker # Runtime is needed, since src/Data requires the OPENAPI_GEN_IMAGE. + USER_ARGS = --user $(shell id -u):$(shell id -g) +else + # Run npm and npx inside container runtime. + # `USER_ARGS` prevents creation of root-owned files. + ifeq ($(DOCKER),podman) + USER_ARGS = --userns=keep-id --user $(shell id -u):$(shell id -g) + else + USER_ARGS = --user $(shell id -u):$(shell id -g) + endif + CA_BUNDLE_ARGS = $(if $(CA_BUNDLE),--volume $(CA_BUNDLE):/etc/ssl/certs/ca-certificates.crt:ro \ + --env NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-certificates.crt) + DOCKER_RUN = $(DOCKER) run --rm -i \ + --entrypoint "" \ + -v "$(CURDIR):/app" \ + -w /app \ + $(USER_ARGS) \ + --env HOME=/tmp \ + $(CA_BUNDLE_ARGS) \ + $(UI_BUILD_IMAGE) + NPM = $(DOCKER_RUN) npm + NPX = $(DOCKER_RUN) npx + CONTAINER_RUNTIME = $(DOCKER) +endif + +.PHONY: all all: src/Data build test node_modules: package-lock.json - npm ci + @echo ">> installing dependencies" + $(NPM) ci +.PHONY: format format: node_modules $(ELM_FILES) @echo ">> format front-end code" - @npx elm-format --yes $(ELM_FILES) + @$(NPX) elm-format --yes $(ELM_FILES) +.PHONY: review review: node_modules - @npx elm-review --fix + @$(NPX) elm-review --fix +.PHONY: test test: node_modules rm -rf elm-stuff/generated-code - @npx elm-format $(ELM_FILES) --validate - @npx elm-review + @$(NPX) elm-format $(ELM_FILES) --validate + @$(NPX) elm-review ifneq ($(JUNIT_DIR),) mkdir -p $(JUNIT_DIR) - @npx elm-test --report=junit | tee $(JUNIT_DIR)/junit.xml + @$(NPX) elm-test --report=junit | tee $(JUNIT_DIR)/junit.xml else - @npx elm-test + @$(NPX) elm-test endif -dev-server: +.PHONY: dev-server +dev-server: node_modules +ifeq ($(DOCKER),) npx elm reactor +else + $(DOCKER) run --rm -it \ + --entrypoint "" \ + -v "$(CURDIR):/app" \ + -w /app \ + -p 8000:8000 \ + $(USER_ARGS) \ + $(CA_BUNDLE_ARGS) \ + $(UI_BUILD_IMAGE) \ + npx elm reactor +endif .PHONY: build -build: .build_stamp +build: dist/.build_stamp -.build_stamp: $(ELM_FILES) index.html vite.config.mjs node_modules +dist/.build_stamp: $(ELM_FILES) index.html vite.config.mjs package-lock.json @echo ">> building frontend" - npm run build - @touch .build_stamp + $(MAKE) node_modules + $(NPM) run build + @touch dist/.build_stamp src/Data: ../../api/v2/openapi.yaml - -rm -r src/Data - set -e; \ - $(DOCKER) run \ - --name openapi-gen \ + -rm -rf src/Data src/DateTime.elm + $(CONTAINER_RUNTIME) run --rm \ + $(USER_ARGS) \ + --entrypoint sh \ -v ${PWD}/../..:/local \ + -v ${PWD}/src:/output/src \ $(OPENAPI_GEN_IMAGE) \ - generate -i /local/api/v2/openapi.yaml -g elm -o /tmp/openapi-gen; \ - trap "$(DOCKER) rm openapi-gen" EXIT; \ - $(DOCKER) cp openapi-gen:/tmp/openapi-gen/src/Data src/Data; \ - $(DOCKER) cp openapi-gen:/tmp/openapi-gen/src/DateTime.elm src/DateTime.elm - make format + -c 'java -jar /opt/openapi-generator/modules/openapi-generator-cli/target/openapi-generator-cli.jar generate -i /local/api/v2/openapi.yaml -g elm -o /tmp/openapi-gen && cp -r /tmp/openapi-gen/src/Data /output/src/Data && cp /tmp/openapi-gen/src/DateTime.elm /output/src/DateTime.elm' + $(MAKE) format .PHONY: clean clean: - - @rm -rf .build_stamp dist elm-stuff src/Data src/DateTime.elm openapi-* elm-* + - @rm -rf dist node_modules elm-stuff src/Data src/DateTime.elm openapi-* elm-*