From b9ff48a3667951fc376b5ea3f2f2ed8e2857b35c Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Fri, 24 Apr 2026 00:06:06 +0200 Subject: [PATCH] =?UTF-8?q?feat(cli):=20observe=20+=20debrief=20+=20note?= =?UTF-8?q?=20=E2=80=94=20ambient=20instrumentation=20for=20collaboration?= =?UTF-8?q?=20days?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three commands that make observation ambient. The theory of the experiment is that watching two agents collaborate for a day is worth more than any amount of theorising about which feature to build next, but only if the data accumulates passively while you work and the end-of-day write-up is guided instead of freeform. cavemem note Records a timestamped scratch note under a reserved `observer` session. Variadic argv so quoting doesn't kill adoption: type `cavemem note codex stepped on claude` and it just works. Notes flow through the same observations pipeline as agent activity, so they interleave in task timelines and show up in search. cavemem observe Live dashboard (3s refresh) of active tasks, participants, recent claims, pending handoffs, and the last ~6 events per task. The footer line — "edits without proactive claims (last 5m)" — is the live falsification test for the auto-claim hypothesis: empty means agents are claiming proactively, populated means the safety net is carrying the load. cavemem debrief End-of-day post-mortem with five guided sections: 1. Tool-usage ratio per session (invisible / occasional / integrated) 2. Auto-join landing (sessions that joined within 2s of start) 3. Proactive-claim ratio (claim-kind obs vs tool_use edit count) 4. Handoff outcome distribution + median accept latency 5. Chronological timeline with observer notes interleaved Post-tool-use now records file_path in tool_use metadata when the tool is Edit/Write/MultiEdit/NotebookEdit. The observe/debrief queries depend on this surface — parsing content at query time would require reversing compression; recording at write time is a couple bytes per observation. Storage adds: pendingHandoffs, recentEditsWithoutClaims, toolUsageBySession, participantJoinFor, editVsClaimStats, handoffStatusDistribution, handoffAcceptLatencies, mixedTimeline. All read-only, all single SQLite queries; most use json_extract. All gates green. --- apps/cli/src/commands/debrief.ts | 213 +++++++++++++++++++ apps/cli/src/commands/note.ts | 62 ++++++ apps/cli/src/commands/observe.ts | 155 ++++++++++++++ apps/cli/src/index.ts | 6 + apps/cli/test/program.test.ts | 3 + packages/hooks/src/handlers/post-tool-use.ts | 12 +- packages/hooks/test/runner.test.ts | 5 +- packages/storage/src/storage.ts | 164 ++++++++++++++ 8 files changed, 618 insertions(+), 2 deletions(-) create mode 100644 apps/cli/src/commands/debrief.ts create mode 100644 apps/cli/src/commands/note.ts create mode 100644 apps/cli/src/commands/observe.ts diff --git a/apps/cli/src/commands/debrief.ts b/apps/cli/src/commands/debrief.ts new file mode 100644 index 0000000..042b854 --- /dev/null +++ b/apps/cli/src/commands/debrief.ts @@ -0,0 +1,213 @@ +import { join } from 'node:path'; +import { loadSettings, resolveDataDir } from '@cavemem/config'; +import { Storage } from '@cavemem/storage'; +import type { Command } from 'commander'; +import kleur from 'kleur'; + +/** + * Default window: last 24h. The "ran it today" common case. Overridable + * with --hours for zoomed-in post-mortems or multi-day sweeps. + */ +const DEFAULT_HOURS = 24; + +interface DebriefContext { + storage: Storage; + since: number; + taskId?: number | undefined; +} + +/** + * Section 1 — did agents use the task tools at all? + * + * Signal: ratio of task-thread-tagged observations to total observations + * per session. A session with many observations but zero task-thread + * ones is an agent that memorized nothing from the SessionStart preface. + */ +function sectionToolUsage(ctx: DebriefContext): string[] { + const lines = [kleur.bold('1. Did agents use the task tools?')]; + const rows = ctx.storage.toolUsageBySession(ctx.since); + if (rows.length === 0) { + lines.push(kleur.dim(' No activity in the window.')); + return lines; + } + for (const r of rows) { + const ratio = r.total_obs > 0 ? Math.round((r.task_tool_obs / r.total_obs) * 100) : 0; + const marker = ratio >= 10 ? kleur.green('✓') : ratio >= 2 ? kleur.yellow('~') : kleur.red('✗'); + lines.push( + ` ${marker} ${r.session_id.padEnd(16)} ${r.total_obs} obs, ${r.task_tool_obs} task-tool (${ratio}%)`, + ); + } + lines.push( + kleur.dim( + ' Interpretation: <2% = tool surface invisible; 2-10% = occasional; >10% = integrated.', + ), + ); + return lines; +} + +/** + * Section 2 — did auto-join land? + * + * We can't directly see what the agent "saw" in its context, but we CAN + * check: for each session that started, did a join event land in + * task_participants within ~2s of session start? If yes, the preface + * generation fired; if no, something broke the auto-join path. + */ +function sectionAutoJoin(ctx: DebriefContext): string[] { + const lines = ['', kleur.bold('2. Did auto-join land?')]; + const sessions = ctx.storage + .listSessions(200) + .filter((s) => s.started_at >= ctx.since && s.id !== 'observer'); + if (sessions.length === 0) { + lines.push(kleur.dim(' No sessions started in window.')); + return lines; + } + let joined = 0; + let missed = 0; + for (const s of sessions) { + const joinRow = ctx.storage.participantJoinFor(s.id); + if (joinRow && joinRow.joined_at - s.started_at < 2000) { + joined++; + } else { + missed++; + lines.push(` ${kleur.red('✗')} ${s.id} (${s.ide}) started but did not join a task`); + } + } + lines.push( + ` ${kleur.green('✓')} ${joined} sessions auto-joined, ${kleur.red(`${missed} missed`)}`, + ); + if (missed > 0) { + lines.push( + kleur.dim( + ' Missed joins usually mean cwd was outside a git repo or the branch lookup failed.', + ), + ); + } + return lines; +} + +/** + * Section 3 — did agents claim proactively? + * + * The critical diagnostic. Compares edit-observations (tool_use with a + * file_path in metadata) against explicit `claim`-kind observations. If + * claims << edits, proactive claiming is failing → auto-claim's safety + * net is doing the work, which argues for keeping it even if flaky. + */ +function sectionProactiveClaims(ctx: DebriefContext): string[] { + const lines = ['', kleur.bold('3. Did agents claim proactively?')]; + const stats = ctx.storage.editVsClaimStats(ctx.since); + lines.push(` Edits observed: ${stats.edit_count}`); + lines.push(` Claims recorded: ${stats.claim_count}`); + const ratio = stats.edit_count > 0 ? Math.round((stats.claim_count / stats.edit_count) * 100) : 0; + const verdict = + ratio >= 70 + ? kleur.green('proactive claiming works — auto-claim is a safety net, not the main path') + : ratio >= 20 + ? kleur.yellow('partial claiming — consider sharpening the preface wording') + : kleur.red('proactive claiming failing — auto-claim is carrying the load'); + lines.push(` Claim/edit ratio: ${ratio}% → ${verdict}`); + return lines; +} + +/** + * Section 4 — handoff outcomes. + * + * Groups handoffs by final status. >30% expiry suggests either a TTL + * that's too short or a receiver-side notification that isn't loud + * enough to land; also reports median accept latency so you can see + * "how fast did the hand-off baton actually pass" empirically. + */ +function sectionHandoffs(ctx: DebriefContext): string[] { + const lines = ['', kleur.bold('4. Handoff outcomes')]; + const dist = ctx.storage.handoffStatusDistribution(ctx.since); + const total = dist.accepted + dist.cancelled + dist.expired + dist.pending; + if (total === 0) { + lines.push(kleur.dim(' No handoffs in window.')); + return lines; + } + const pct = (n: number) => `${Math.round((n / total) * 100)}%`; + lines.push(` accepted: ${dist.accepted} (${pct(dist.accepted)})`); + lines.push(` cancelled: ${dist.cancelled} (${pct(dist.cancelled)}) ${kleur.dim('(declined)')}`); + lines.push(` expired: ${dist.expired} (${pct(dist.expired)})`); + lines.push(` pending: ${dist.pending} (${pct(dist.pending)}) ${kleur.dim('(still live)')}`); + + const expiryRate = dist.expired / total; + if (expiryRate > 0.3) { + lines.push( + ` ${kleur.yellow('⚠')} ${Math.round(expiryRate * 100)}% expiry rate — shorten TTL, sharpen notification, or rethink the default.`, + ); + } + + const times = ctx.storage.handoffAcceptLatencies(ctx.since); + if (times.length > 0) { + const sorted = [...times].sort((a, b) => a - b); + const median = sorted[Math.floor(sorted.length / 2)] ?? 0; + lines.push(` median time-to-accept: ${Math.round(median / 60_000)}m`); + } + return lines; +} + +/** + * Section 5 — interleaved timeline. + * + * No analysis, just chronology. Observer notes are colored magenta so + * you can scan for moments where your note sits next to an agent event — + * those are the coordination failures the numeric sections can't surface. + */ +function sectionTimeline(ctx: DebriefContext): string[] { + const lines = ['', kleur.bold('5. Timeline (observer notes interleaved with agent activity)')]; + const events = ctx.storage.mixedTimeline(ctx.since, ctx.taskId); + if (events.length === 0) { + lines.push(kleur.dim(' No events.')); + return lines; + } + for (const e of events) { + const ts = new Date(e.ts).toISOString().slice(11, 19); + const isNote = e.kind === 'observer-note'; + const prefix = isNote ? kleur.magenta(' NOTE ') : ` ${e.kind.padEnd(6)}`; + const who = kleur.dim(e.session_id.padEnd(10)); + const head = e.content.split('\n')[0]?.slice(0, 70) ?? ''; + lines.push(`${kleur.dim(ts)} ${prefix} ${who} ${head}`); + } + return lines; +} + +export function registerDebriefCommand(program: Command): void { + program + .command('debrief') + .description('End-of-day collaboration post-mortem: 5 structured sections over DB evidence.') + .option('--hours ', 'Window size in hours', String(DEFAULT_HOURS)) + .option('--task ', 'Narrow the timeline section to a specific task thread') + .action((opts: { hours: string; task?: string }) => { + const settings = loadSettings(); + const dbPath = join(resolveDataDir(settings.dataDir), 'data.db'); + const storage = new Storage(dbPath); + try { + const ctx: DebriefContext = { + storage, + since: Date.now() - Number(opts.hours) * 3_600_000, + taskId: opts.task ? Number(opts.task) : undefined, + }; + const sections = [ + sectionToolUsage(ctx), + sectionAutoJoin(ctx), + sectionProactiveClaims(ctx), + sectionHandoffs(ctx), + sectionTimeline(ctx), + ]; + for (const s of sections) process.stdout.write(`${s.join('\n')}\n`); + + // Hard-coded reflection prompts — the debrief's point is to pick + // one concrete next thing, not to admire the data. + process.stdout.write(`\n${kleur.bold('Next-action prompts:')}\n`); + process.stdout.write(' • Was collaboration meaningfully better than no-hivemind?\n'); + process.stdout.write( + ' • Which failures were missing-tool vs. tool-not-called vs. structural?\n', + ); + process.stdout.write(' • What was the most valuable moment the system created?\n'); + } finally { + storage.close(); + } + }); +} diff --git a/apps/cli/src/commands/note.ts b/apps/cli/src/commands/note.ts new file mode 100644 index 0000000..3d2d195 --- /dev/null +++ b/apps/cli/src/commands/note.ts @@ -0,0 +1,62 @@ +import { join } from 'node:path'; +import { loadSettings, resolveDataDir } from '@cavemem/config'; +import { MemoryStore } from '@cavemem/core'; +import type { Command } from 'commander'; +import kleur from 'kleur'; + +/** + * Reserved session identifier for human scratch notes. Using a fixed id + * (rather than a per-invocation random one) means every note across the + * whole day lives under the same session, which makes "all my notes" + * filters and timeline queries trivial. + */ +const OBSERVER_SESSION_ID = 'observer'; + +/** + * Idempotently materialise the observer session so the FK from + * observations.session_id holds. `startSession` is `INSERT OR IGNORE`, so + * this is effectively free after the first call. + */ +function ensureObserverSession(store: MemoryStore): void { + store.startSession({ + id: OBSERVER_SESSION_ID, + ide: 'observer', + cwd: process.cwd(), + }); +} + +export function registerNoteCommand(program: Command): void { + program + // Variadic so `cavemem note codex stepped on claude` works without + // quoting. The quoting-every-note friction kills adoption otherwise. + .command('note ') + .description('Record a timestamped scratch note into the memory timeline') + .option('--task ', 'Attach this note to a specific task thread (shows up in task_timeline)') + .action(async (words: string[], opts: { task?: string }) => { + const text = words.join(' ').trim(); + if (!text) { + process.stderr.write(`${kleur.red('empty note')}\n`); + process.exitCode = 1; + return; + } + + const settings = loadSettings(); + const dbPath = join(resolveDataDir(settings.dataDir), 'data.db'); + const store = new MemoryStore({ dbPath, settings }); + try { + ensureObserverSession(store); + const id = store.addObservation({ + session_id: OBSERVER_SESSION_ID, + kind: 'observer-note', + content: text, + ...(opts.task ? { task_id: Number(opts.task) } : {}), + }); + const when = new Date().toISOString().slice(11, 19); + process.stdout.write( + `${kleur.green('✓')} note #${id} at ${when}${opts.task ? ` on task #${opts.task}` : ''}\n`, + ); + } finally { + store.close(); + } + }); +} diff --git a/apps/cli/src/commands/observe.ts b/apps/cli/src/commands/observe.ts new file mode 100644 index 0000000..ae5345d --- /dev/null +++ b/apps/cli/src/commands/observe.ts @@ -0,0 +1,155 @@ +import { join } from 'node:path'; +import { loadSettings, resolveDataDir } from '@cavemem/config'; +import { Storage } from '@cavemem/storage'; +import type { Command } from 'commander'; +import kleur from 'kleur'; + +/** + * Refresh cadence. Three seconds is a compromise: fast enough that new + * claims show up while you're still looking at the screen, slow enough + * that the redraw flicker isn't distracting in peripheral vision. + */ +const REFRESH_MS = 3000; + +/** + * "Recent" window for the unclaimed-edits diagnostic. Five minutes + * matches the conflict-warning window used by `UserPromptSubmit` — + * keeping them aligned makes the mental math on both tools align. + */ +const RECENT_WINDOW_MS = 5 * 60_000; + +function fmtAgo(ts: number): string { + const ms = Date.now() - ts; + if (ms < 60_000) return `${Math.round(ms / 1000)}s ago`; + if (ms < 3_600_000) return `${Math.round(ms / 60_000)}m ago`; + return `${Math.round(ms / 3_600_000)}h ago`; +} + +/** + * Paint one frame. Extracted so the setInterval loop stays one line + * and so the renderer is easy to invoke from a future snapshot test. + */ +function renderFrame(storage: Storage): string { + const lines: string[] = []; + const now = new Date().toISOString().slice(11, 19); + lines.push(`${kleur.bold('cavemem observe')} ${kleur.dim(now)}`); + lines.push(kleur.dim('─'.repeat(60))); + + const tasks = storage.listTasks(5); + if (tasks.length === 0) { + lines.push(kleur.dim('No tasks yet. Start a session in a git repo to auto-create one.')); + return lines.join('\n'); + } + + for (const task of tasks) { + lines.push(''); + lines.push( + `${kleur.cyan(`task #${task.id}`)} ${kleur.bold(task.branch)} ${kleur.dim(task.repo_root)}`, + ); + + const participants = storage.listParticipants(task.id); + const participantLine = participants + .map((p) => `${p.agent} (${fmtAgo(p.joined_at)})`) + .join(', '); + lines.push(` ${kleur.dim('participants:')} ${participantLine || 'none'}`); + + const claims = storage.recentClaims(task.id, Date.now() - RECENT_WINDOW_MS); + if (claims.length > 0) { + lines.push(` ${kleur.dim('claims:')}`); + for (const c of claims) { + lines.push( + ` ${c.file_path.padEnd(40)} ${kleur.yellow(c.session_id.padEnd(10))} ${fmtAgo(c.claimed_at)}`, + ); + } + } + + const pending = storage.pendingHandoffs(task.id); + if (pending.length > 0) { + lines.push(` ${kleur.dim('pending handoffs:')}`); + for (const h of pending) { + const meta = safeJson(h.metadata) as { + from_agent?: string; + to_agent?: string; + summary?: string; + }; + const summary = (meta.summary ?? '').slice(0, 50); + lines.push(` #${h.id} ${meta.from_agent ?? '?'} → ${meta.to_agent ?? '?'}: ${summary}`); + } + } + + const recent = storage.taskTimeline(task.id, 6); + if (recent.length > 0) { + lines.push(` ${kleur.dim('recent:')}`); + // taskTimeline is DESC — reverse so the most recent line is last, + // matching "read top-down as a chronological stream". + for (const r of [...recent].reverse()) { + const ts = new Date(r.ts).toISOString().slice(11, 19); + const kindColor = r.kind === 'observer-note' ? kleur.magenta : kleur.cyan; + lines.push( + ` ${kleur.dim(ts)} ${kindColor(r.kind.padEnd(15))} ${r.content.slice(0, 48)}`, + ); + } + } + } + + // Diagnostic footer — the single most valuable piece of the dashboard, + // placed last where it doesn't scroll off. Only counts edits that lack + // an explicit `claim`-kind observation; the auto-claim side effect is + // deliberately not credited here, because the point of this diagnostic + // is to measure *proactive* behaviour. + lines.push(''); + const unclaimed = storage.recentEditsWithoutClaims(Date.now() - RECENT_WINDOW_MS); + if (unclaimed.length === 0) { + lines.push(kleur.green('edits without proactive claims (last 5m): none')); + } else { + lines.push(kleur.yellow(`edits without proactive claims (last 5m): ${unclaimed.length}`)); + for (const e of unclaimed.slice(0, 10)) { + lines.push(` ${e.file_path} ${kleur.dim(`(${e.session_id}, ${fmtAgo(e.ts)})`)}`); + } + } + + return lines.join('\n'); +} + +function safeJson(s: string | null): Record { + if (!s) return {}; + try { + const v = JSON.parse(s) as unknown; + return v && typeof v === 'object' ? (v as Record) : {}; + } catch { + return {}; + } +} + +export function registerObserveCommand(program: Command): void { + program + .command('observe') + .description('Live dashboard of collaboration state. Run in a spare terminal during a session.') + .option('--interval ', 'Refresh interval in milliseconds', String(REFRESH_MS)) + .action((opts: { interval: string }) => { + const settings = loadSettings(); + const dbPath = join(resolveDataDir(settings.dataDir), 'data.db'); + const storage = new Storage(dbPath); + const intervalMs = Math.max(500, Number(opts.interval)); + + // \x1b[2J clears the screen; \x1b[H sends the cursor home. Minimal + // cross-platform approach — avoids heavyweight `blessed`/`ink` deps + // for what is ultimately a glorified printf loop. + const paint = () => { + process.stdout.write('\x1b[2J\x1b[H'); + process.stdout.write(renderFrame(storage)); + process.stdout.write(`\n\n${kleur.dim(`refresh ${intervalMs}ms · ctrl-c to exit`)}\n`); + }; + + paint(); + const handle = setInterval(paint, intervalMs); + + const stop = () => { + clearInterval(handle); + storage.close(); + process.exit(0); + }; + process.on('SIGINT', stop); + process.on('SIGTERM', stop); + }); +} diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 35b414f..0753aec 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -4,12 +4,15 @@ import { pathToFileURL } from 'node:url'; import { Command } from 'commander'; import { registerCompressCommands } from './commands/compress.js'; import { registerConfigCommand } from './commands/config.js'; +import { registerDebriefCommand } from './commands/debrief.js'; import { registerDoctorCommand } from './commands/doctor.js'; import { registerExportCommand } from './commands/export.js'; import { registerHookCommand } from './commands/hook.js'; import { registerInstallCommand } from './commands/install.js'; import { registerLifecycleCommands } from './commands/lifecycle.js'; import { registerMcpCommand } from './commands/mcp.js'; +import { registerNoteCommand } from './commands/note.js'; +import { registerObserveCommand } from './commands/observe.js'; import { registerReindexCommand } from './commands/reindex.js'; import { registerSearchCommand } from './commands/search.js'; import { registerStatusCommand } from './commands/status.js'; @@ -37,6 +40,9 @@ export function createProgram(): Command { registerExportCommand(program); registerHookCommand(program); registerReindexCommand(program); + registerNoteCommand(program); + registerObserveCommand(program); + registerDebriefCommand(program); return program; } diff --git a/apps/cli/test/program.test.ts b/apps/cli/test/program.test.ts index f82cdb2..b1cf1c7 100644 --- a/apps/cli/test/program.test.ts +++ b/apps/cli/test/program.test.ts @@ -8,6 +8,7 @@ describe('cavemem CLI program', () => { const expected = [ 'compress', 'config', + 'debrief', 'doctor', 'expand', 'export', @@ -15,6 +16,8 @@ describe('cavemem CLI program', () => { 'import', 'install', 'mcp', + 'note', + 'observe', 'reindex', 'restart', 'search', diff --git a/packages/hooks/src/handlers/post-tool-use.ts b/packages/hooks/src/handlers/post-tool-use.ts index 253c703..230ca50 100644 --- a/packages/hooks/src/handlers/post-tool-use.ts +++ b/packages/hooks/src/handlers/post-tool-use.ts @@ -27,11 +27,21 @@ export async function postToolUse(store: MemoryStore, input: HookInput): Promise 4000, ); if (!body.trim()) return; + + // Capture touched files in the observation metadata. Parsing content for + // file_path later would require reversing compression — cheap to record + // at write time, expensive to recover at query time. The `observe` and + // `debrief` commands both depend on this surface for edit-vs-claim + // diagnostics, so we pay the tiny write cost unconditionally. + const touchedFiles = extractTouchedFiles(tool, toolInput); + const metadata: Record = { tool }; + if (touchedFiles.length > 0) metadata.file_path = touchedFiles[0]; + store.addObservation({ session_id: input.session_id, kind: 'tool_use', content: body, - metadata: { tool }, + metadata, }); // Side effect: record a claim for every file this tool edited. Observed diff --git a/packages/hooks/test/runner.test.ts b/packages/hooks/test/runner.test.ts index cd653d2..95a7a87 100644 --- a/packages/hooks/test/runner.test.ts +++ b/packages/hooks/test/runner.test.ts @@ -177,7 +177,10 @@ describe('runHook', () => { expect(r.ok).toBe(true); const tl = store.timeline('sess-cc'); expect(tl).toHaveLength(1); - expect(tl[0]?.metadata).toEqual({ tool: 'Edit' }); + // Edit + file_path: the handler now records the touched file path in + // metadata so observe/debrief can correlate edits with claims without + // re-parsing the content field. + expect(tl[0]?.metadata).toEqual({ tool: 'Edit', file_path: '/tmp/x.txt' }); expect(tl[0]?.content).toContain('Edit'); }); diff --git a/packages/storage/src/storage.ts b/packages/storage/src/storage.ts index a532a30..8fb7b24 100644 --- a/packages/storage/src/storage.ts +++ b/packages/storage/src/storage.ts @@ -472,6 +472,170 @@ export class Storage { transaction(fn: () => T): T { return this.db.transaction(fn)(); } + + // --- observe / debrief analytics --- + // + // These are read-heavy queries serving the CLI dashboards. They stay on + // the Storage class (not a separate analytics module) because they work + // on the same prepared-statement cache and benefit from colocation with + // the tables they query. + + /** Pending, non-expired handoffs on a task. */ + pendingHandoffs(task_id: number): ObservationRow[] { + return this.db + .prepare( + `SELECT * FROM observations + WHERE task_id = ? AND kind = 'handoff' + AND json_extract(metadata, '$.status') = 'pending' + ORDER BY ts DESC LIMIT 50`, + ) + .all(task_id) as ObservationRow[]; + } + + /** + * Recent write-tool observations whose file wasn't explicitly claimed by + * the agent that edited it. "Claimed" here means an explicit `claim`-kind + * observation — not the auto-claim side effect — so the query measures + * proactive behavior, not the automatic safety net. + */ + recentEditsWithoutClaims( + since_ts: number, + limit = 20, + ): Array<{ session_id: string; file_path: string; ts: number; task_id: number | null }> { + return this.db + .prepare( + `SELECT o.session_id, + json_extract(o.metadata, '$.file_path') AS file_path, + o.ts, + o.task_id + FROM observations o + WHERE o.kind = 'tool_use' + AND o.ts > ? + AND json_extract(o.metadata, '$.file_path') IS NOT NULL + AND NOT EXISTS ( + SELECT 1 FROM observations c + WHERE c.kind = 'claim' + AND c.session_id = o.session_id + AND json_extract(c.metadata, '$.file_path') = json_extract(o.metadata, '$.file_path') + AND c.ts <= o.ts + ) + ORDER BY o.ts DESC + LIMIT ?`, + ) + .all(since_ts, limit) as Array<{ + session_id: string; + file_path: string; + ts: number; + task_id: number | null; + }>; + } + + /** Per-session activity since `since_ts`, split into total observations + * and task-thread-tagged observations. Ratio is the debrief's first + * signal of whether an agent found the tools at all. */ + toolUsageBySession( + since_ts: number, + ): Array<{ session_id: string; total_obs: number; task_tool_obs: number }> { + return this.db + .prepare( + `SELECT + session_id, + COUNT(*) AS total_obs, + SUM(CASE WHEN task_id IS NOT NULL THEN 1 ELSE 0 END) AS task_tool_obs + FROM observations + WHERE ts > ? AND session_id != 'observer' + GROUP BY session_id + ORDER BY total_obs DESC`, + ) + .all(since_ts) as Array<{ + session_id: string; + total_obs: number; + task_tool_obs: number; + }>; + } + + /** First task-participant row for a session, used to verify auto-join + * fired within ~2s of SessionStart. */ + participantJoinFor(session_id: string): TaskParticipantRow | undefined { + return this.db + .prepare( + 'SELECT * FROM task_participants WHERE session_id = ? ORDER BY joined_at ASC LIMIT 1', + ) + .get(session_id) as TaskParticipantRow | undefined; + } + + /** Edit count vs explicit-claim count — the critical diagnostic for + * whether proactive claiming is working in the wild. */ + editVsClaimStats(since_ts: number): { edit_count: number; claim_count: number } { + const edit = this.db + .prepare( + `SELECT COUNT(*) AS n FROM observations + WHERE ts > ? AND kind = 'tool_use' + AND json_extract(metadata, '$.file_path') IS NOT NULL`, + ) + .get(since_ts) as { n: number }; + const claim = this.db + .prepare("SELECT COUNT(*) AS n FROM observations WHERE ts > ? AND kind = 'claim'") + .get(since_ts) as { n: number }; + return { edit_count: edit.n, claim_count: claim.n }; + } + + /** Count of handoffs by final status in the window. */ + handoffStatusDistribution(since_ts: number): { + accepted: number; + cancelled: number; + expired: number; + pending: number; + } { + const row = this.db + .prepare( + `SELECT + SUM(CASE WHEN json_extract(metadata, '$.status') = 'accepted' THEN 1 ELSE 0 END) AS accepted, + SUM(CASE WHEN json_extract(metadata, '$.status') = 'cancelled' THEN 1 ELSE 0 END) AS cancelled, + SUM(CASE WHEN json_extract(metadata, '$.status') = 'expired' THEN 1 ELSE 0 END) AS expired, + SUM(CASE WHEN json_extract(metadata, '$.status') = 'pending' THEN 1 ELSE 0 END) AS pending + FROM observations WHERE ts > ? AND kind = 'handoff'`, + ) + .get(since_ts) as { + accepted: number | null; + cancelled: number | null; + expired: number | null; + pending: number | null; + }; + return { + accepted: row.accepted ?? 0, + cancelled: row.cancelled ?? 0, + expired: row.expired ?? 0, + pending: row.pending ?? 0, + }; + } + + /** Milliseconds between handoff post and accept, for accepted handoffs. */ + handoffAcceptLatencies(since_ts: number): number[] { + const rows = this.db + .prepare( + `SELECT (json_extract(metadata, '$.accepted_at') - ts) AS latency_ms + FROM observations + WHERE ts > ? AND kind = 'handoff' + AND json_extract(metadata, '$.status') = 'accepted' + AND json_extract(metadata, '$.accepted_at') IS NOT NULL`, + ) + .all(since_ts) as Array<{ latency_ms: number }>; + return rows.map((r) => r.latency_ms).filter((n) => Number.isFinite(n) && n >= 0); + } + + /** Mixed-source timeline (agent activity + observer notes) ordered + * oldest → newest so the debrief reads as a chronological story. */ + mixedTimeline(since_ts: number, task_id?: number, limit = 200): ObservationRow[] { + if (task_id !== undefined) { + return this.db + .prepare('SELECT * FROM observations WHERE ts > ? AND task_id = ? ORDER BY ts ASC LIMIT ?') + .all(since_ts, task_id, limit) as ObservationRow[]; + } + return this.db + .prepare('SELECT * FROM observations WHERE ts > ? ORDER BY ts ASC LIMIT ?') + .all(since_ts, limit) as ObservationRow[]; + } } function sanitizeMatch(q: string): string {