strausmann
diff --git a/‎README.md‎
Lines changed: 16 additions & 0 deletions b/‎README.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎backend/alembic/versions/20260517_phase7b_datetime_tz.py‎
Lines changed: 47 additions & 0 deletions b/‎backend/alembic/versions/20260517_phase7b_datetime_tz.py‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎backend/app/api/routes/print.py‎
Lines changed: 2 additions & 1 deletion b/‎backend/app/api/routes/print.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backend/app/api/routes/printers.py‎
Lines changed: 30 additions & 45 deletions b/‎backend/app/api/routes/printers.py‎
Lines changed: 30 additions & 45 deletions
diff --git a/‎backend/app/db/lifespan.py‎
Lines changed: 126 additions & 3 deletions b/‎backend/app/db/lifespan.py‎
Lines changed: 126 additions & 3 deletions
@@ -92,6 +92,22 @@ curl http://localhost:8080/healthz   # frontend → backend_reachable: true
 | `POST` | `/jobs/{job_id}/resume` | Resume a job paused by tape mismatch (after the user changed the tape physically) | — |
 | `POST` | `/printer/resume` | Resume the printer queue after a recoverable error halted it (tape empty / cover open / offline) | — |
 | `GET` | `/healthz` | Liveness probe for orchestrators | — |
+| `GET` | `/readiness` | Readiness probe — deep check for reverse-proxy routing | — |
+
+### Health Probes
+
+The backend exposes two HTTP probes with different semantics:
+
+| Endpoint | Purpose | What it answers |
+|----------|---------|-----------------|
+| `GET /healthz` | Liveness — Docker / Kubernetes container restart signal | "the process and the event loop are alive" |
+| `GET /readiness` | Readiness — reverse-proxy routing signal | "the process can serve traffic right now": database connectable, alembic at head, templates seeded, runtime printer matches DB, SNMP probe fresh, queue worker alive, SSE bus capacity ok |
+
+`/readiness` returns HTTP 200 with a `status` of `ready` (all checks pass) or `degraded` (only non-critical checks are failing — still routable), and HTTP 503 with a `status` of `not-ready` when any critical check (database, alembic, template_seed) fails.
+
+Pangolin's `targets[0].healthcheck.path` can use `/readiness` for deep checks instead of `/healthz`; Docker container healthchecks should stay on `/healthz` to avoid restart loops on transient DB failures.
+
+See `docs/superpowers/specs/2026-05-17-phase-7b-foundation-design.md` for the full check list and rationale.
 
 ### `POST /print` request body
 
 
@@ -0,0 +1,47 @@
+"""Phase 7b — normalise existing datetime rows to timezone-aware ISO strings.
+
+Existing rows from Phase 5 inserts contain naive datetimes (no TZ suffix)
+that break the Go frontend's RFC3339 parser. This migration appends
+`+00:00` to any value that does NOT already contain `+` or end with `Z`.
+SQLite is dynamically typed so no ALTER TABLE is required — the new column
+type from B4 only affects new inserts via the SQLAlchemy layer.
+
+Revision ID: 20260517_phase7b_datetime_tz
+Revises: b2668b6e8845
+Create Date: 2026-05-17
+"""
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "20260517_phase7b_datetime_tz"
+down_revision = "b2668b6e8845"
+branch_labels = None
+depends_on = None
+
+
+# (table name, datetime column names) pairs. upgrade() issues one UPDATE per
+# listed column to append a UTC offset to values that lack one.
+# NOTE(review): assumes every listed table/column exists at this revision —
+# confirm against the schema produced by the parent revision.
+_TABLES_DT = [
+    ("templates", ["created_at", "updated_at"]),
+    ("printers", ["created_at", "updated_at"]),
+    ("jobs", ["created_at", "updated_at", "started_at", "finished_at"]),
+    ("presets", ["created_at", "updated_at"]),
+    ("printer_state", ["updated_at"]),
+    ("printer_status_cache", ["captured_at", "updated_at"]),
+]
+
+
+def upgrade() -> None:
+    """Append ``+00:00`` to stored datetime values that carry no UTC offset.
+
+    Issues one UPDATE per (table, column) pair in ``_TABLES_DT``. A value is
+    left alone when it is NULL, already contains a ``+``, or ends with ``Z``,
+    so running the statements again does not double-append the suffix.
+    """
+    for table, cols in _TABLES_DT:
+        for col in cols:
+            # Table and column names come from the module-level constant, not
+            # from external input, so interpolating them into the SQL is safe.
+            # NOTE(review): a value with a negative offset (e.g. ``-05:00``)
+            # matches neither LIKE filter and would get ``+00:00`` appended —
+            # confirm no such rows exist before running this migration.
+            op.execute(
+                f"UPDATE {table} SET {col} = {col} || '+00:00' "
+                f"WHERE {col} IS NOT NULL "
+                f"AND {col} NOT LIKE '%+%' "
+                f"AND {col} NOT LIKE '%Z'"
+            )
+
+
+def downgrade() -> None:
+    """Do nothing; the offset suffixes added by ``upgrade()`` are kept."""
+    # The naive-datetime state being reverted to is exactly the bug we
+    # are fixing. Downgrade is intentionally a no-op.
+    pass
@@ -4,6 +4,7 @@
 
 import logging
 from typing import Any
+from uuid import UUID
 
 from fastapi import APIRouter, HTTPException, Request, status
 from fastapi.responses import JSONResponse
@@ -32,7 +33,7 @@
 class _PrinterResumeResponse(BaseModel):
     """200 response body for POST /printer/resume."""
 
-    printer_id: str
+    # Widened from ``str``: the field now accepts a UUID as well as a plain string.
+    printer_id: UUID | str
     state: str
 
 
 
@@ -22,7 +22,6 @@
 
 from __future__ import annotations
 
-import asyncio
 import dataclasses
 import logging
 from datetime import UTC, datetime
@@ -166,65 +165,51 @@ def _error_label(block: Any) -> str | None:
 @router.get(
     "/{printer_id}/status",
     response_model=PrinterStatus,
-    summary="Force a fresh printer status probe",
+    summary="Return the latest cached printer status",
     description=(
-        "Sends an ESC i S command to the printer over TCP/9100.  "
-        "The result is written back to ``printer_status_cache`` and returned. "
-        "Returns 503 when the printer is unreachable."
+        "Returns the most recent status written by the background SNMP probe worker. "
+        "The response is served from ``printer_status_cache`` — no synchronous SNMP "
+        "probe is performed, so the response always returns in <10 ms. "
+        "When no probe has completed yet ``online`` is ``null`` and ``note`` explains why. "
+        "Returns 404 when the printer is not registered."
     ),
 )
 async def get_printer_status(
     printer_id: UUID,
     session: SessionDep,
 ) -> PrinterStatus:
-    """Probe the printer and update the cache."""
-    printer = await _get_printer_or_404(session, printer_id)
+    """Return the latest cached status for a printer; no sync SNMP probe.
+
+    Looks the printer up first (``_get_printer_or_404`` is expected to raise
+    when it is not registered), then reads its row from the status cache.
+
+    Returns:
+        A placeholder ``PrinterStatus`` with ``online=None`` and an explanatory
+        ``note`` when there is no cache row or the row has no ``captured_at``.
+        Otherwise a status built from the cached ``parsed`` payload, with
+        ``last_probe_age_s`` set to the whole seconds elapsed since capture.
+        ``captured_at`` is always timezone-aware: a naive stored value is
+        interpreted as UTC.
+    """
+    await _get_printer_or_404(session, printer_id)
 
-    host: str | None = printer.connection.get("host") if printer.connection else None
-    if not host:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"printer {printer_id} has no 'host' in connection config",
+    row = await cache_repo.get(session, printer_id)
+    if row is None or row.captured_at is None:
+        return PrinterStatus(
+            printer_id=printer_id,
+            online=None,
+            captured_at=None,
+            note="No probe yet — wait up to 30s for first probe cycle",
         )
 
-    port: int = int(printer.connection.get("port", 9100))
-
-    try:
-        result = await asyncio.to_thread(_probe_status_sync, host, port)
-    except OSError as exc:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail=f"printer {printer_id} unreachable: {exc}",
-        ) from exc
+    parsed = row.parsed or {}
+    # Rows written before the timezone-aware column type may be naive; treat
+    # them as UTC so the subtraction below does not mix naive and aware values.
+    captured = row.captured_at
+    if captured.tzinfo is None:
+        captured = captured.replace(tzinfo=UTC)
+    age_s = int((datetime.now(UTC) - captured).total_seconds())
 
-    block = result["block"]
-    raw: bytes = result["raw"]
-    now = datetime.now(UTC)
-
-    parsed: dict[str, Any] = {
-        "media_width_mm": block.media_width_mm,
-        "media_type": block.media_type.name,
-        "status_type": block.status_type.name,
-        "phase_type": block.phase_type.name,
-        "errors": int(block.errors),
-        "tape_color": block.tape_color.name,
-        "text_color": block.text_color.name,
-    }
+    loaded_tape_mm = parsed.get("loaded_tape_mm")
+    tape_loaded = f"{loaded_tape_mm}mm" if loaded_tape_mm else None
 
-    await cache_repo.upsert(
-        session,
-        printer_id,
-        raw_block=raw,
-        parsed=parsed,
-        captured_at=now,
-    )
+    error_flags = parsed.get("error_flags") or []
+    error_state = ", ".join(error_flags) if error_flags else None
 
     return PrinterStatus(
         printer_id=printer_id,
-        online=True,
-        tape_loaded=_tape_label(block),
-        error_state=_error_label(block),
-        captured_at=now,
+        online=parsed.get("online"),
+        tape_loaded=tape_loaded,
+        error_state=error_state,
+        # Return the UTC-normalised value, not the raw row attribute, so the
+        # serialised timestamp always carries an offset and agrees with age_s.
+        captured_at=captured,
+        last_probe_age_s=age_s,
+        last_error=parsed.get("last_error"),
     )
 
 
 
@@ -7,15 +7,28 @@
 
 Call order in main.py lifespan:
     1. run_migrations()          — apply pending Alembic revisions
-    2. recover_inflight_jobs()   — mark stale QUEUED/PRINTING jobs as failed_restart
-    3. seed_templates()          — upsert YAML seed templates into DB
-    4. ensure_printer_state()    — create missing printer_state rows
+    1b. verify_alembic_at_head() — assert DB revision == script head (fail fast)
+    2. _discover_plugins()       — register integration + model plugins (idempotent)
+    3. TemplateLoader.load_dir() — populate in-memory template cache (Cluster 1a)
+    4. recover_inflight_jobs()   — mark stale QUEUED/PRINTING jobs as failed_restart
+    5. seed_templates()          — YAML → DB upsert (defensive check on cache)
+    6. upsert_runtime_printer()  — env → DB Printer row (Cluster 1b)
+    7. ensure_printer_state()    — create missing printer_state rows per Printer
+
+Note: steps 2 and 3 must precede step 5 — TemplateLoader.load_dir() validates
+templates against IntegrationRegistry (populated in step 2), and seed_templates()
+reads from the cache that load_dir() populates in step 3.
 """
 
 from __future__ import annotations
 
+from uuid import UUID
+
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.config import Settings
+from app.models.printer import Printer
+from app.services.printer_identity import derive_printer_id
 from app.services.template_loader import TemplateLoader
 
 
@@ -49,6 +62,55 @@ def _upgrade() -> None:
     await asyncio.to_thread(_upgrade)
 
 
+async def verify_alembic_at_head(settings: Settings) -> None:
+    """Raise RuntimeError if the DB's alembic revision does not match the script head.
+
+    Lifespan calls this right after run_migrations() so a half-applied or
+    corrupted DB fails startup loudly with a clear log line, instead of
+    crashing later inside ORM queries with cryptic schema errors.
+
+    Takes settings explicitly so unit tests can verify against ad-hoc DBs
+    without monkey-patching the get_settings() lru_cache singleton — that's
+    the C2/D2 testability pattern.
+
+    Args:
+        settings: Supplies ``database_url``, the only attribute read here.
+
+    Raises:
+        RuntimeError: When the revision recorded in the database differs from
+            the head revision of the alembic script directory.
+    """
+    import asyncio
+    from pathlib import Path as _Path
+
+    from alembic.config import Config
+    from alembic.runtime.migration import MigrationContext
+    from alembic.script import ScriptDirectory
+    from sqlalchemy import create_engine
+
+    # backend/app/db/lifespan.py → parents[2] = backend/
+    ini_path = _Path(__file__).resolve().parents[2] / "alembic.ini"
+
+    def _check() -> tuple[str | None, str | None]:
+        """Return ``(current DB revision, script head revision)`` synchronously."""
+        cfg = Config(str(ini_path))
+        # Prevent alembic from calling logging.config.fileConfig() which would
+        # reconfigure the root logger and break pytest caplog fixtures.
+        cfg.attributes["configure_logger"] = False
+        script = ScriptDirectory.from_config(cfg)
+        head_rev = script.get_current_head()
+
+        # SQLAlchemy's synchronous engine: strip the async driver suffix
+        # NOTE(review): only the ``+aiosqlite`` suffix is removed; any other
+        # async driver in database_url would reach create_engine() unchanged.
+        sync_url = settings.database_url.replace("+aiosqlite", "")
+        engine = create_engine(sync_url)
+        try:
+            with engine.connect() as conn:
+                ctx = MigrationContext.configure(conn)
+                current_rev = ctx.get_current_revision()
+        finally:
+            # Dispose the throwaway engine even when the connection fails.
+            engine.dispose()
+
+        return current_rev, head_rev
+
+    # The check uses a synchronous engine, so run it in a worker thread to
+    # keep the event loop free.
+    current_rev, head_rev = await asyncio.to_thread(_check)
+    if current_rev != head_rev:
+        raise RuntimeError(
+            f"Alembic migration drift detected: DB at {current_rev!r}, expected head {head_rev!r}"
+        )
+
+
 async def recover_inflight_jobs(session: AsyncSession) -> int:
     """Mark any QUEUED or PRINTING jobs as FAILED_RESTART.
 
@@ -70,8 +132,16 @@ async def seed_templates(session: AsyncSession, loader: type[TemplateLoader]) ->
     main.py can call by name, and is the natural seam for unit tests that
     want to inject a mock loader without touching the real registry.
 
+    Raises RuntimeError if the loader cache is empty — calling seed_templates
+    without first running TemplateLoader.load_dir() is a lifespan-ordering bug.
+
     Returns the count of rows touched (inserted or updated).
     """
+    if not loader._cache:
+        raise RuntimeError(
+            "seed_templates called with empty TemplateLoader cache — "
+            "TemplateLoader.load_dir() must run before seed_templates()."
+        )
     return await loader.seed_db(session)
 
 
@@ -102,3 +172,56 @@ async def ensure_printer_state(session: AsyncSession) -> int:
         await session.commit()
 
     return created
+
+
+async def upsert_runtime_printer(
+    session: AsyncSession,
+    settings: Settings,
+) -> UUID | None:
+    """Materialise one Printer row from env config; return its deterministic id.
+
+    Returns ``None`` when the environment does not yield a printer model, a
+    host and a port (e.g. mock backend in CI).  The lifespan calls this between
+    ``seed_templates`` and ``ensure_printer_state`` so every restart
+    keeps the single runtime printer row consistent with the current env.
+
+    The Printer row is keyed by the deterministic UUIDv5 produced by
+    ``derive_printer_id(model, host, port)`` — the same id that the
+    print-queue driver uses, so the DB row and the in-memory printer share
+    one stable identity across restarts.
+
+    Every env-derived column (``name``, ``model``, ``backend``,
+    ``connection``, ``enabled``) is written on both the insert and the update
+    path.  ``backend`` is not part of the derived id, so without refreshing it
+    an existing row would keep a stale backend after the env changed.
+
+    Args:
+        session: Session used to look up, add and flush the row.  The caller
+            owns the transaction; this function only flushes.
+        settings: Source of the printer model, host, port, backend and SNMP
+            options.
+
+    Returns:
+        The printer's id, or ``None`` when no printer is configured.
+    """
+    model: str = settings.printer_model
+    # Resolve host: pt750w takes precedence, ql820 is the fallback.
+    host: str = settings.pt750w_host or settings.ql820_host or ""
+    port: int = settings.pt750w_port if settings.pt750w_host else settings.ql820_port
+
+    if not (model and host and port):
+        return None
+
+    printer_id: UUID = derive_printer_id(model, host, port)
+    # One field set shared by both paths, so the update can never drift from
+    # what a fresh insert would have written.
+    fields: dict[str, object] = {
+        "name": f"{model} ({host})",
+        "model": model.lower(),
+        "backend": settings.printer_backend,
+        "connection": {
+            "host": host,
+            "port": port,
+            "snmp": settings.printer_discover_via_snmp,
+            "snmp_community": settings.printer_snmp_community,
+        },
+        "enabled": True,
+    }
+
+    existing = await session.get(Printer, printer_id)
+    if existing is None:
+        session.add(Printer(id=printer_id, **fields))
+    else:
+        for attr, value in fields.items():
+            setattr(existing, attr, value)
+    await session.flush()
+    return printer_id