From c01ed631d6d28733d0057bd6dbd37acc86fb3bd2 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 1 Jun 2026 09:38:55 +0200
Subject: [PATCH 1/4] refactor(routing): extract replica picker into
 pkg/clusterrouting (#10123)

Move ReplicaCandidate and PickBestReplica out of core/services/nodes (which depends on gorm) into a new dependency-light leaf package pkg/clusterrouting, so the p2p federation server can later share the same replica-selection policy without pulling in a database driver.

core/services/nodes keeps a type alias and a thin delegator, so every existing reference (the LoadedReplicaStats interface method, the ReplicaCandidate row conversion in registry.go, and the SQL policy-mirror test) compiles and behaves unchanged. This is a pure, behavior-preserving refactor: the full nodes suite, including the policy-mirror spec that pins the SQL ORDER BY to PickBestReplica, stays green.

Assisted-by: Claude Code:claude-opus-4-8

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/services/nodes/replicapicker.go          | 74 +++----------------
 .../clusterrouting_suite_test.go              | 13 ++++
 pkg/clusterrouting/replica.go                 | 66 +++++++++++++++++
 .../clusterrouting/replica_test.go            |  6 +-
 4 files changed, 93 insertions(+), 66 deletions(-)
 create mode 100644 pkg/clusterrouting/clusterrouting_suite_test.go
 create mode 100644 pkg/clusterrouting/replica.go
 rename core/services/nodes/replicapicker_test.go => pkg/clusterrouting/replica_test.go (95%)

diff --git a/core/services/nodes/replicapicker.go b/core/services/nodes/replicapicker.go
index 56d383e6178b..0c784060fc33 100644
--- a/core/services/nodes/replicapicker.go
+++ b/core/services/nodes/replicapicker.go
@@ -1,69 +1,17 @@
 package nodes
 
-import "time"
+import "github.com/mudler/LocalAI/pkg/clusterrouting"
 
-// ReplicaCandidate is the minimum view of a loaded model replica needed to
-// apply the routing policy. It is intentionally decoupled from the gorm models
-// (BackendNode, NodeModel) so the same picker can run against fresh DB rows
-// (SmartRouter.Route → FindAndLockNodeWithModel) and against an in-memory
-// snapshot (the per-frontend rotating cache flagged in pkg/model — see TODO
-// below).
-type ReplicaCandidate struct {
-	NodeID        string
-	Address       string
-	ReplicaIndex  int
-	InFlight      int
-	LastUsed      time.Time
-	AvailableVRAM uint64
-}
+// ReplicaCandidate aliases the canonical type in pkg/clusterrouting. The policy
+// implementation moved there so the p2p federation server can share it without
+// importing this package (which pulls in gorm). Because this is a type alias,
+// existing references such as the LoadedReplicaStats interface method and the
+// ReplicaCandidate(rw) row conversion in registry.go remain valid unchanged.
+type ReplicaCandidate = clusterrouting.ReplicaCandidate
 
-// PickBestReplica is the single source of truth for which loaded replica of a
-// model serves the next request.
-//
-// Policy (ordered tiers, first non-tie wins):
-//  1. Least in-flight wins — primary load-balancing signal.
-//  2. Oldest last_used wins — round-robin between equally-loaded replicas.
-//     Every successful pick refreshes last_used (in FindAndLockNodeWithModel's
-//     transaction and in TouchNodeModel on cache hits), so the "oldest" tier
-//     naturally rotates through the candidate set without a separate cursor.
-//  3. Largest available_vram wins — cold-start tiebreaker for replicas that
-//     have never been picked (identical last_used).
-//
-// Two callers must agree on this policy:
-//
-//   - SmartRouter.Route, via the SQL ORDER BY in FindAndLockNodeWithModel
-//     (registry.go). That query MUST mirror this function — TestPickerSQLMirror
-//     asserts both sides agree on a representative dataset.
-//
-//   - The per-frontend rotating-replica cache (NOT YET IMPLEMENTED — see
-//     pkg/model/loader.go and pkg/model/initializers.go for the integration
-//     point). When that cache lands, it will call PickBestReplica against an
-//     in-memory snapshot using locally-tracked in-flight counters and skip the
-//     per-request DB round-trip.
-//
-// Returns nil when the candidate list is empty. Does not allocate.
+// PickBestReplica delegates to the canonical implementation in pkg/clusterrouting.
+// The SQL ORDER BY in FindAndLockNodeWithModel (registry.go) must mirror it; the
+// "policy mirror" spec in registry_test.go asserts they agree.
 func PickBestReplica(candidates []ReplicaCandidate) *ReplicaCandidate {
-	if len(candidates) == 0 {
-		return nil
-	}
-	best := &candidates[0]
-	for i := 1; i < len(candidates); i++ {
-		c := &candidates[i]
-		if betterReplica(c, best) {
-			best = c
-		}
-	}
-	return best
-}
-
-// betterReplica reports whether candidate a is preferred over candidate b
-// under the policy documented on PickBestReplica.
-func betterReplica(a, b *ReplicaCandidate) bool {
-	if a.InFlight != b.InFlight {
-		return a.InFlight < b.InFlight
-	}
-	if !a.LastUsed.Equal(b.LastUsed) {
-		return a.LastUsed.Before(b.LastUsed)
-	}
-	return a.AvailableVRAM > b.AvailableVRAM
+	return clusterrouting.PickBestReplica(candidates)
 }
diff --git a/pkg/clusterrouting/clusterrouting_suite_test.go b/pkg/clusterrouting/clusterrouting_suite_test.go
new file mode 100644
index 000000000000..9301ff6624c2
--- /dev/null
+++ b/pkg/clusterrouting/clusterrouting_suite_test.go
@@ -0,0 +1,13 @@
+package clusterrouting
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestClusterRouting(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "ClusterRouting Suite")
+}
diff --git a/pkg/clusterrouting/replica.go b/pkg/clusterrouting/replica.go
new file mode 100644
index 000000000000..9c3beceff033
--- /dev/null
+++ b/pkg/clusterrouting/replica.go
@@ -0,0 +1,66 @@
+// Package clusterrouting holds the transport-agnostic replica selection policy
+// shared by the NATS distributed mode (core/services/nodes) and the p2p
+// federation server (core/p2p). It deliberately depends on nothing heavier than
+// the standard library so either transport can import it without pulling in a
+// database driver or message bus.
+package clusterrouting
+
+import "time"
+
+// ReplicaCandidate is the minimum view of a loaded model replica needed to
+// apply the routing policy. It is intentionally decoupled from any storage
+// model (gorm rows on the NATS side, gossiped NodeData on the p2p side) so the
+// same picker runs against fresh DB rows, an in-memory snapshot, or p2p gossip.
+type ReplicaCandidate struct {
+	NodeID        string
+	Address       string
+	ReplicaIndex  int
+	InFlight      int
+	LastUsed      time.Time
+	AvailableVRAM uint64
+}
+
+// PickBestReplica is the single source of truth for which loaded replica of a
+// model serves the next request.
+//
+// Policy (ordered tiers, first non-tie wins):
+//  1. Least in-flight wins: primary load-balancing signal.
+//  2. Oldest last_used wins: round-robin between equally-loaded replicas.
+//     Every successful pick refreshes last_used (in the NATS
+//     FindAndLockNodeWithModel transaction and in TouchNodeModel on cache
+//     hits), so the "oldest" tier naturally rotates through the candidate set
+//     without a separate cursor.
+//  3. Largest available_vram wins: cold-start tiebreaker for replicas that
+//     have never been picked (identical last_used).
+//
+// The NATS SQL ORDER BY in FindAndLockNodeWithModel
+// (core/services/nodes/registry.go) MUST mirror this function; the "agrees
+// with PickBestReplica on a seeded dataset (policy mirror)" spec in that
+// package's registry_test.go asserts both sides agree and fails if they drift.
+//
+// Returns nil when the candidate list is empty. Does not allocate.
+func PickBestReplica(candidates []ReplicaCandidate) *ReplicaCandidate {
+	if len(candidates) == 0 {
+		return nil
+	}
+	best := &candidates[0]
+	for i := 1; i < len(candidates); i++ {
+		c := &candidates[i]
+		if betterReplica(c, best) {
+			best = c
+		}
+	}
+	return best
+}
+
+// betterReplica reports whether candidate a is preferred over candidate b
+// under the policy documented on PickBestReplica.
+func betterReplica(a, b *ReplicaCandidate) bool {
+	if a.InFlight != b.InFlight {
+		return a.InFlight < b.InFlight
+	}
+	if !a.LastUsed.Equal(b.LastUsed) {
+		return a.LastUsed.Before(b.LastUsed)
+	}
+	return a.AvailableVRAM > b.AvailableVRAM
+}
diff --git a/core/services/nodes/replicapicker_test.go b/pkg/clusterrouting/replica_test.go
similarity index 95%
rename from core/services/nodes/replicapicker_test.go
rename to pkg/clusterrouting/replica_test.go
index d71b83808dd5..5627fa72d49b 100644
--- a/core/services/nodes/replicapicker_test.go
+++ b/pkg/clusterrouting/replica_test.go
@@ -1,4 +1,4 @@
-package nodes
+package clusterrouting
 
 import (
 	"time"
@@ -36,7 +36,7 @@ var _ = Describe("PickBestReplica", func() {
 
 	It("uses oldest last_used as the tiebreaker when in_flight ties", func() {
 		// All three tied on in_flight=0. Without last_used, available_vram
-		// would pin every pick to the fattest node — the exact bug
+		// would pin every pick to the fattest node: the exact bug
 		// fix(distributed): round-robin replicas of the same model addressed.
 		cs := []ReplicaCandidate{
 			{NodeID: "fat-recent", InFlight: 0, LastUsed: ref.Add(2 * time.Second), AvailableVRAM: 24_000_000_000},
@@ -47,7 +47,7 @@ var _ = Describe("PickBestReplica", func() {
 	})
 
 	It("uses largest available_vram as the final tiebreaker", func() {
-		// in_flight tied AND last_used tied — pick the largest GPU.
+		// in_flight tied AND last_used tied: pick the largest GPU.
 		cs := []ReplicaCandidate{
 			{NodeID: "small", InFlight: 0, LastUsed: ref, AvailableVRAM: 8_000_000_000},
 			{NodeID: "fat", InFlight: 0, LastUsed: ref, AvailableVRAM: 24_000_000_000},

From 5a0013defeae01cbd36d7333f1673294c8b35dfc Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Mon, 1 Jun 2026 13:24:36 +0100
Subject: [PATCH 2/4] test(react-ui): add page render-smoke specs, reset the
 coverage gate (#10122)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The UI coverage gate was tightened to 0.1pp against a fast-local
measurement (39.86% baseline); CI's slower runners measure ~0.9pp lower,
so tests-ui-e2e failed there. UI e2e coverage is diffusely
non-deterministic and tracks machine speed — a 0.1pp band can't hold
across environments.

Rather than loosen the gate, raise the floor under it: a render-smoke
spec mounts each lazy page (navigate + assert the header renders),
covering a dozen previously-untested pages and lifting coverage from
~39% to ~42.7% locally. Restore the tolerance to 0.8pp and set the
baseline conservatively (40.0), below the slow-CI floor, so the ratchet
holds without flapping.

Document the coverage policy in AGENTS.md, CONTRIBUTING.md and
.agents/building-and-testing.md: install the git hooks and don't bypass
them (no --no-verify, no hand-lowering the baseline or widening the
tolerance); raise coverage by adding tests instead; and set the UI
baseline below the slow-CI floor.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 .agents/building-and-testing.md               | 15 ++++---
 AGENTS.md                                     |  1 +
 CONTRIBUTING.md                               |  6 +++
 core/http/react-ui/coverage-baseline.txt      |  2 +-
 .../react-ui/e2e/page-render-smoke.spec.js    | 40 +++++++++++++++++++
 scripts/ui-coverage-check.sh                  | 33 ++++++++-------
 6 files changed, 76 insertions(+), 21 deletions(-)
 create mode 100644 core/http/react-ui/e2e/page-render-smoke.spec.js

diff --git a/.agents/building-and-testing.md b/.agents/building-and-testing.md
index 04df3426e300..3cf85c0dc283 100644
--- a/.agents/building-and-testing.md
+++ b/.agents/building-and-testing.md
@@ -38,9 +38,12 @@ The React UI (`core/http/react-ui/`) has **no component/unit tests** — its onl
 - **Browser:** the flake dev shell ships `chromium` and exports `PLAYWRIGHT_CHROMIUM_PATH`; `playwright.config.js` uses it via `launchOptions.executablePath`, and the Makefile skips `playwright install` when it's set. This avoids Playwright's downloaded browser, which can't resolve system libs (`libglib-2.0`, …) on NixOS. In CI (no `PLAYWRIGHT_CHROMIUM_PATH`) the Makefile falls back to `playwright install --with-deps chromium`.
 - The app is a React SPA, so coverage accumulates across in-app navigation within a test; a full `page.goto`/reload resets it.
 - `.nycrc.json` uses `all: true`, so **every `src/**` file is in the report**, including 0%-coverage ones — that's how you spot features with no test at all (sort the HTML report or `coverage-summary.json` by line% ascending). 
-- **UI coverage gate:** `make test-ui-coverage-check` runs the suite then `scripts/ui-coverage-check.sh`, failing if total line coverage drops more than `UI_COVERAGE_TOLERANCE` (default **1.0pp**) below `core/http/react-ui/coverage-baseline.txt`. `make test-ui-coverage-baseline` regenerates the baseline. **Why a tolerance (unlike the strict Go gate):** UI e2e line coverage is *non-deterministic* — async/debounced paths (e.g. the VRAM estimate's 500ms debounce) make identical specs vary ~0.5pp run-to-run, so a zero-tolerance gate would flake. Keep the tolerance just above the observed jitter. Run in CI (`tests-ui-e2e.yml`) and pre-commit on `core/http/react-ui/` changes.
-
-Rules:
-- The gate is **strict — there is no tolerance**. Any decrease fails, regardless of how many lines a PR adds or deletes. `covermode=atomic` makes line coverage deterministic, so there's no run-to-run jitter to excuse.
-- When a change legitimately **raises** coverage, run `make test-coverage-baseline` and **commit** the updated `coverage-baseline.txt` so the ratchet moves up. Never lower the baseline by hand.
-- If you can't get coverage back to baseline, the fix is to **add tests**, not to edit the baseline.
+- **UI coverage gate:** `make test-ui-coverage-check` runs the suite then `scripts/ui-coverage-check.sh`, failing if total line coverage drops more than `UI_COVERAGE_TOLERANCE` below `core/http/react-ui/coverage-baseline.txt`. `make test-ui-coverage-baseline` regenerates the baseline. Runs in CI (`tests-ui-e2e.yml`) and pre-commit on `core/http/react-ui/` changes.
+- **Why it has a tolerance (unlike the strict Go gate):** UI e2e coverage is *non-deterministic*. Specs that assert on state and end while async/lazy render work is still in flight collect those lines only when the render beats the coverage teardown — so the total drifts with machine speed/load (a fast local box reads higher than a slow CI runner), diffusely across many specs. The tolerance absorbs that drift, so set the baseline *below* the slow-CI floor, never to a fast-local `make test-ui-coverage-baseline` number, or CI flaps.
+- **Raising coverage is cheap:** a *render-smoke* spec (navigate to a route, assert its header renders) mounts a lazy page and runs its full render + initial effects, capturing most of its lines in a few lines of test — see `e2e/page-render-smoke.spec.js`. Auth is disabled in the test server (`isAdmin=true`), so `RequireAdmin`/`RequireFeature` routes render without a mock. The most *deterministic* win is removing a race: make a spec `await` a rendered element before ending (see `e2e/agents.spec.js` → AgentCreate) so its lines count every run.
+
+Rules (both gates):
+- **Install the hooks:** `make install-hooks` once per clone so lint + coverage run pre-commit. Don't lean on CI for what the hook catches.
+- **Don't work around the gate:** never `git commit --no-verify`, and never hand-lower a baseline or widen a tolerance to turn a red gate green. The ratchet only moves up.
+- If a change drops coverage, **add tests** (sort `coverage-summary.json` by line% ascending to find untested code) rather than editing the baseline. When coverage legitimately rises, commit the regenerated baseline (`make test-coverage-baseline` / `test-ui-coverage-baseline`).
+- The Go gate is **strict — no tolerance**; `covermode=atomic` keeps it deterministic. The UI gate keeps a small tolerance only because its e2e coverage isn't.
diff --git a/AGENTS.md b/AGENTS.md
index 1d7e29e9cfae..9f397e613fca 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -35,6 +35,7 @@ LocalAI follows the Linux kernel project's [guidelines for AI coding assistants]
 
 ## Quick Reference
 
+- **Git hooks & coverage gates**: Run `make install-hooks` once per clone so the pre-commit lint + coverage gates run. **Never bypass them with `git commit --no-verify`, and never lower a coverage baseline or widen a gate's tolerance to turn a red gate green** — the coverage ratchet only moves up. If a change drops coverage, add tests to raise it (e.g. render-smoke specs). See [.agents/building-and-testing.md](.agents/building-and-testing.md).
 - **Logging**: Use `github.com/mudler/xlog` (same API as slog)
 - **Go style**: Prefer `any` over `interface{}`
 - **Comments**: Explain *why*, not *what*
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index df1c78909ac6..c45e269b27da 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -266,6 +266,12 @@ The e2e tests run LocalAI in a Docker container and exercise the API:
 make test-e2e
 ```
 
+### React UI tests and coverage
+
+The React UI (`core/http/react-ui/`) is covered by Playwright e2e specs, gated by a **monotonic line-coverage ratchet** (`make test-ui-coverage-check`, run in CI and pre-commit). The metric is non-deterministic — a fast local box reads higher than a slow CI runner for the same code — so a small tolerance is unavoidable.
+
+**If your change lowers UI coverage, raise it back by adding specs — do not widen the tolerance or hand-lower the baseline.** A *render-smoke* spec (navigate to a page, assert its header is visible) cheaply covers an entire lazy page. See `core/http/react-ui/e2e/page-render-smoke.spec.js` and the full policy in [.agents/building-and-testing.md](.agents/building-and-testing.md#react-ui-coverage).
+
 ### Running E2E container tests
 
 These tests build a standard LocalAI Docker image and run it with pre-configured model configs to verify that most endpoints work correctly:
diff --git a/core/http/react-ui/coverage-baseline.txt b/core/http/react-ui/coverage-baseline.txt
index b4be1a3b71c6..b2e09eeb0f44 100644
--- a/core/http/react-ui/coverage-baseline.txt
+++ b/core/http/react-ui/coverage-baseline.txt
@@ -1 +1 @@
-39.86
\ No newline at end of file
+40.0
\ No newline at end of file
diff --git a/core/http/react-ui/e2e/page-render-smoke.spec.js b/core/http/react-ui/e2e/page-render-smoke.spec.js
new file mode 100644
index 000000000000..40cfa1897599
--- /dev/null
+++ b/core/http/react-ui/e2e/page-render-smoke.spec.js
@@ -0,0 +1,40 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Render-smoke coverage. Each page is lazy-loaded and runs its full render +
+// initial effects on mount, so a bare visit captures the bulk of a page's
+// lines — cheap, real coverage for pages that have no dedicated spec yet.
+//
+// This is the project's preferred way to keep the UI coverage gate green:
+// raise the floor by covering more, rather than loosening the gate's
+// tolerance (see CONTRIBUTING.md → "React UI tests and coverage"). Auth is disabled in
+// the test server, so RequireAdmin/RequireFeature resolve to isAdmin=true and
+// every gated route renders without an auth/capability mock.
+//
+// Asserts the page mounted (a .page-title or .empty-state-title header is
+// visible) and that it did not bounce to a gate redirect (/login or /app home).
+const PAGES = [
+  ['/app/talk', 'Talk'],
+  ['/app/usage', 'Usage'],
+  ['/app/account', 'Account'],
+  ['/app/studio', 'Studio'],
+  ['/app/manage', 'Manage'],
+  ['/app/backends', 'Backends'],
+  ['/app/settings', 'Settings'],
+  ['/app/nodes', 'Nodes'],
+  ['/app/face', 'Face recognition'],
+  ['/app/voice', 'Voice recognition'],
+  ['/app/fine-tune', 'Fine-tuning'],
+  ['/app/quantize', 'Quantize'],
+]
+
+test.describe('Page render smoke', () => {
+  for (const [path, label] of PAGES) {
+    test(`renders ${label} (${path})`, async ({ page }) => {
+      await page.goto(path)
+      // .page-title for the normal header; .empty-state-title for pages that
+      // render a gated/empty state (e.g. Account when auth is disabled).
+      await expect(page.locator('.page-title, .empty-state-title').first()).toBeVisible({ timeout: 15_000 })
+      await expect(page).toHaveURL(new RegExp(path.replace(/\//g, '\\/') + '$'))
+    })
+  }
+})
diff --git a/scripts/ui-coverage-check.sh b/scripts/ui-coverage-check.sh
index 33a43748c19c..9d24df7ee09b 100755
--- a/scripts/ui-coverage-check.sh
+++ b/scripts/ui-coverage-check.sh
@@ -4,28 +4,33 @@
 #
 # Compares the total line coverage in an nyc coverage-summary.json against a
 # committed baseline and fails (exit 1) if it dropped by more than
-# UI_COVERAGE_TOLERANCE percentage points (default 0.1). The React UI e2e suite
+# UI_COVERAGE_TOLERANCE percentage points (default 0.8). The React UI e2e suite
 # drives the real app, so a removed feature or deleted spec shows up as a
 # coverage drop here.
 #
-# The tolerance exists only to absorb the irreducible measurement noise floor,
-# NOT to permit regression. UI e2e coverage USED to swing ~1pp run-to-run, which
-# forced a loose 0.8pp band — but that swing was a bug, not inherent jitter: a
-# spec that navigated to a route and ended on the URL assertion let the target
-# component's render race the coverage teardown, so ~400 lines were collected
-# only when the render won (see e2e/agents.spec.js → AgentCreate). With that race
-# fixed, repeated runs land within ~0.013pp (a handful of lines) of each other,
-# so the band is tightened to 0.1pp — enough for the noise floor, tight enough
-# that a real ~40-line regression still trips the gate. If a future run wobbles
-# more, fix the racing spec (await a rendered element) rather than loosening this.
+# Why the band is this wide: UI e2e line coverage is NOT deterministic. Many
+# specs assert on state and end while async/lazy render work is still in flight,
+# so those lines are collected only when the render beats the coverage teardown
+# — and that depends on machine speed/load. The effect is diffuse (spread across
+# dozens of specs, no single dominant file) and tracks the runner: a quiet local
+# box measures ~0.9pp higher than a slow/loaded CI runner for the SAME tree
+# (observed: 39.9% local vs 39.0% CI). The tolerance absorbs that spread; setting
+# it tighter (it was briefly 0.1pp, calibrated to a lucky fast-local cluster)
+# makes CI flap.
 #
-# When coverage rises meaningfully, regenerate and commit the baseline with:
-#   make test-ui-coverage-baseline
+# The principled way to tighten this is to remove the variance at the source —
+# make each racing spec await a rendered element before ending (e2e/agents.spec.js
+# → AgentCreate fixed the single biggest one) — NOT to chase the baseline up to a
+# fast-machine high or loosen further. Keep the baseline conservatively at or
+# below the slow-runner floor so the band catches real regressions, not jitter.
+#
+# When coverage rises meaningfully AND reproducibly (check on a slow/CI-like run),
+# regenerate and commit the baseline with:  make test-ui-coverage-baseline
 set -eu
 
 summary="${1:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
 baseline_file="${2:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
-tolerance="${UI_COVERAGE_TOLERANCE:-0.1}"
+tolerance="${UI_COVERAGE_TOLERANCE:-0.8}"
 
 if [ ! -f "$summary" ]; then
 	echo "ui-coverage-check: coverage summary not found: $summary" >&2

From 7013e13f052e3381a393afd3fc6d546e163a40fd Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 1 Jun 2026 14:24:51 +0200
Subject: [PATCH 3/4] chore: :arrow_up: Update ggml-org/llama.cpp to
 `399739d5c5978351f39e3454bfbfbab4f369088f` (#10119)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index b80e8b99a8ad..770de8aaed3d 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=d6588daa800058dfa54f1d7ea695b1a810c8ae18
+LLAMA_VERSION?=399739d5c5978351f39e3454bfbfbab4f369088f
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=

From c61838dba638dd592e1068b7e7a859f70cba3006 Mon Sep 17 00:00:00 2001
From: Zhao73 <156770117+Zhao73@users.noreply.github.com>
Date: Mon, 1 Jun 2026 20:31:08 +0800
Subject: [PATCH 4/4] docs: fix documentation typos (#10125)

Correct clear spelling mistakes in documentation without changing behavior.

Confidence: high
Scope-risk: narrow
Tested: git diff --check; uvx codespell on changed files
Not-tested: Full docs build not run; text-only changes
Assisted-by: Codex:gpt-5 codespell
---
 docs/content/advanced/advanced-usage.md   | 2 +-
 docs/content/features/image-generation.md | 6 +++---
 docs/content/features/text-generation.md  | 4 ++--
 docs/content/whats-new.md                 | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/content/advanced/advanced-usage.md b/docs/content/advanced/advanced-usage.md
index 7742eb29a874..9b99eba805ac 100644
--- a/docs/content/advanced/advanced-usage.md
+++ b/docs/content/advanced/advanced-usage.md
@@ -273,7 +273,7 @@ A list of the environment variable that tweaks parallelism is the following:
 ```
 ### Python backends GRPC max workers
 ### Default number of workers for GRPC Python backends.
-### This actually controls wether a backend can process multiple requests or not.
+### This actually controls whether a backend can process multiple requests or not.
 
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 
diff --git a/docs/content/features/image-generation.md b/docs/content/features/image-generation.md
index e35b7fbf08c2..bb9748dd940e 100644
--- a/docs/content/features/image-generation.md
+++ b/docs/content/features/image-generation.md
@@ -199,7 +199,7 @@ Pipelines types available:
 
 ##### Advanced: Additional parameters
 
-Additional arbitrarly parameters can be specified in the option field in key/value separated by `:`:
+Additional arbitrary parameters can be specified in the option field in key/value separated by `:`:
 
 ```yaml
 name: animagine-xl
@@ -207,7 +207,7 @@ options:
 - "cfg_scale:6"
 ```
 
-**Note**: There is no complete parameter list. Any parameter can be passed arbitrarly and is passed to the model directly as argument to the pipeline. Different pipelines/implementations support different parameters.
+**Note**: There is no complete parameter list. Any parameter can be passed arbitrarily and is passed to the model directly as argument to the pipeline. Different pipelines/implementations support different parameters.
 
 The example above, will result in the following python code when generating images:
 
@@ -342,4 +342,4 @@ diffusers:
 ```bash
 (echo -n '{"prompt": "spiderman surfing","size": "512x512","model":"txt2vid"}') |
 curl -H "Content-Type: application/json" -X POST -d @- http://localhost:8080/v1/images/generations
-```
\ No newline at end of file
+```
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index b39377e73f4d..709fbaf5220e 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -897,7 +897,7 @@ The backend will automatically download the required files in order to run the m
 - `OVModelForCausalLM` requires OpenVINO IR [Text Generation](https://huggingface.co/models?library=openvino&pipeline_tag=text-generation) models from Hugging face
 - `OVModelForFeatureExtraction` works with any Safetensors Transformer [Feature Extraction](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers,safetensors) model from Huggingface (Embedding Model)
 
-Please note that streaming is currently not implemente in `AutoModelForCausalLM` for Intel GPU.
+Please note that streaming is currently not implemented in `AutoModelForCausalLM` for Intel GPU.
 AMD GPU support is not implemented.
 Although AMD CPU is not officially supported by OpenVINO there are reports that it works: YMMV.
 
@@ -1008,4 +1008,4 @@ template:
 
   completion: |
     {{.Input}}
-```
\ No newline at end of file
+```
diff --git a/docs/content/whats-new.md b/docs/content/whats-new.md
index 8a393b4b4e0d..e93fd64838ed 100644
--- a/docs/content/whats-new.md
+++ b/docs/content/whats-new.md
@@ -105,7 +105,7 @@ It is now possible for single-devices with one GPU to specify `--single-active-b
 
 #### Resources management
 
-Thanks to the continous community efforts (another cool contribution from {{< github "dave-gray101" >}} ) now it's possible to shutdown a backend programmatically via the API.
+Thanks to the continuous community efforts (another cool contribution from {{< github "dave-gray101" >}} ) now it's possible to shutdown a backend programmatically via the API.
 There is an ongoing effort in the community to better handling of resources. See also the [🔥Roadmap](https://localai.io/#-hot-topics--roadmap).
 
 #### New how-to section