Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions apps/cli/src/commands/debrief.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import { join } from 'node:path';
import { loadSettings, resolveDataDir } from '@cavemem/config';
import { Storage } from '@cavemem/storage';
import type { Command } from 'commander';
import kleur from 'kleur';

/**
 * Default look-back window, in hours, used when `--hours` is not given.
 * 24 covers the common "ran it today" case; pass `--hours` to zoom in for
 * a post-mortem or to widen the window for a multi-day sweep.
 */
const DEFAULT_HOURS = 24;

/**
 * Shared inputs handed to every debrief section renderer.
 */
interface DebriefContext {
/** Storage handle the section functions run their queries against. */
storage: Storage;
/** Lower bound of the reporting window in epoch milliseconds (`Date.now()` minus the `--hours` window). */
since: number;
/** Optional task id from `--task`; only the timeline section forwards it to storage. */
taskId?: number | undefined;
}

/**
 * Section 1 — task-tool adoption, one row per session.
 *
 * For every row returned by `storage.toolUsageBySession(since)` this prints
 * the share of task-tool observations among all observations, rounded to a
 * whole percent (0% when the session has no observations). A session with
 * plenty of observations but no task-tool ones never picked up the tools
 * advertised in the SessionStart preface.
 *
 * @param ctx Debrief context supplying the storage handle and window start.
 * @returns Output lines: heading, then either a "no activity" notice or one
 *          row per session followed by the interpretation legend.
 */
function sectionToolUsage(ctx: DebriefContext): string[] {
  const heading = kleur.bold('1. Did agents use the task tools?');
  const usage = ctx.storage.toolUsageBySession(ctx.since);
  if (usage.length === 0) {
    return [heading, kleur.dim(' No activity in the window.')];
  }

  const sessionRows = usage.map((row) => {
    const percent =
      row.total_obs > 0 ? Math.round((row.task_tool_obs / row.total_obs) * 100) : 0;

    // Traffic-light badge: green from 10%, yellow from 2%, red below that.
    let badge: string;
    if (percent >= 10) {
      badge = kleur.green('✓');
    } else if (percent >= 2) {
      badge = kleur.yellow('~');
    } else {
      badge = kleur.red('✗');
    }

    return ` ${badge} ${row.session_id.padEnd(16)} ${row.total_obs} obs, ${row.task_tool_obs} task-tool (${percent}%)`;
  });

  const legend = kleur.dim(
    ' Interpretation: <2% = tool surface invisible; 2-10% = occasional; >10% = integrated.',
  );
  return [heading, ...sessionRows, legend];
}

/**
 * Section 2 — auto-join health.
 *
 * What the agent actually "saw" is not observable, so this checks a proxy:
 * for each non-observer session started inside the window, is there a join
 * row (`storage.participantJoinFor`) whose `joined_at` is less than 2000 ms
 * after the session's `started_at`? Sessions that fail the check are listed
 * individually, followed by a joined/missed summary.
 *
 * Only the sessions returned by `storage.listSessions(200)` are considered
 * (200 is presumably a row limit — confirm against the storage API).
 *
 * @param ctx Debrief context supplying the storage handle and window start.
 * @returns Output lines, starting with a blank separator and the heading.
 */
function sectionAutoJoin(ctx: DebriefContext): string[] {
  const out = ['', kleur.bold('2. Did auto-join land?')];

  const candidates = ctx.storage
    .listSessions(200)
    .filter((session) => session.id !== 'observer' && session.started_at >= ctx.since);
  if (candidates.length === 0) {
    out.push(kleur.dim(' No sessions started in window.'));
    return out;
  }

  let joinedCount = 0;
  let missedCount = 0;
  for (const session of candidates) {
    const joinEvent = ctx.storage.participantJoinFor(session.id);
    const landedInTime = joinEvent ? joinEvent.joined_at - session.started_at < 2000 : false;
    if (!landedInTime) {
      missedCount += 1;
      out.push(
        ` ${kleur.red('✗')} ${session.id} (${session.ide}) started but did not join a task`,
      );
      continue;
    }
    joinedCount += 1;
  }

  const missedLabel = kleur.red(`${missedCount} missed`);
  out.push(` ${kleur.green('✓')} ${joinedCount} sessions auto-joined, ${missedLabel}`);

  if (missedCount > 0) {
    out.push(
      kleur.dim(
        ' Missed joins usually mean cwd was outside a git repo or the branch lookup failed.',
      ),
    );
  }
  return out;
}

/**
 * Section 3 — did agents claim proactively?
 *
 * The critical diagnostic. Compares the edit count against the explicit
 * claim count reported by `storage.editVsClaimStats(since)`. If claims are
 * far below edits, proactive claiming is failing and auto-claim's safety net
 * is doing the work, which argues for keeping it even if flaky.
 *
 * Verdict thresholds on the rounded claim/edit percentage:
 * >= 70 works, >= 20 partial, otherwise failing. When no edits were observed
 * there is nothing to divide by, so the ratio is reported as n/a instead of
 * a misleading "0% → failing" verdict.
 *
 * @param ctx Debrief context supplying the storage handle and window start.
 * @returns Output lines, starting with a blank separator and the heading.
 */
function sectionProactiveClaims(ctx: DebriefContext): string[] {
  const lines = ['', kleur.bold('3. Did agents claim proactively?')];
  const stats = ctx.storage.editVsClaimStats(ctx.since);
  lines.push(` Edits observed: ${stats.edit_count}`);
  lines.push(` Claims recorded: ${stats.claim_count}`);

  // No edits means no evidence either way; don't raise a red "failing" alarm
  // on an idle window.
  if (!(stats.edit_count > 0)) {
    lines.push(kleur.dim(' Claim/edit ratio: n/a — no edits observed in the window.'));
    return lines;
  }

  const ratio = Math.round((stats.claim_count / stats.edit_count) * 100);
  const verdict =
    ratio >= 70
      ? kleur.green('proactive claiming works — auto-claim is a safety net, not the main path')
      : ratio >= 20
        ? kleur.yellow('partial claiming — consider sharpening the preface wording')
        : kleur.red('proactive claiming failing — auto-claim is carrying the load');
  lines.push(` Claim/edit ratio: ${ratio}% → ${verdict}`);
  return lines;
}

/**
 * Section 4 — handoff outcomes.
 *
 * Groups handoffs by final status using
 * `storage.handoffStatusDistribution(since)` and prints each bucket with its
 * share of the total. An expiry share above 30% suggests either a TTL that
 * is too short or a receiver-side notification that isn't loud enough to
 * land, so a warning line is added. Also reports the median accept latency
 * (from `storage.handoffAcceptLatencies(since)`, treated as milliseconds and
 * shown in whole minutes) so you can see how fast the baton actually passed.
 *
 * @param ctx Debrief context supplying the storage handle and window start.
 * @returns Output lines, starting with a blank separator and the heading.
 */
function sectionHandoffs(ctx: DebriefContext): string[] {
  const lines = ['', kleur.bold('4. Handoff outcomes')];
  const dist = ctx.storage.handoffStatusDistribution(ctx.since);
  const total = dist.accepted + dist.cancelled + dist.expired + dist.pending;
  if (total === 0) {
    lines.push(kleur.dim(' No handoffs in window.'));
    return lines;
  }
  const pct = (n: number): string => `${Math.round((n / total) * 100)}%`;
  lines.push(` accepted: ${dist.accepted} (${pct(dist.accepted)})`);
  lines.push(` cancelled: ${dist.cancelled} (${pct(dist.cancelled)}) ${kleur.dim('(declined)')}`);
  lines.push(` expired: ${dist.expired} (${pct(dist.expired)})`);
  lines.push(` pending: ${dist.pending} (${pct(dist.pending)}) ${kleur.dim('(still live)')}`);

  const expiryRate = dist.expired / total;
  if (expiryRate > 0.3) {
    // Handoffs expiring unanswered means the TTL is too short for receivers
    // to react, so the remedy is a longer TTL, not a shorter one.
    lines.push(
      ` ${kleur.yellow('⚠')} ${Math.round(expiryRate * 100)}% expiry rate — lengthen TTL, sharpen notification, or rethink the default.`,
    );
  }

  const times = ctx.storage.handoffAcceptLatencies(ctx.since);
  if (times.length > 0) {
    lines.push(` median time-to-accept: ${Math.round(medianOf(times) / 60_000)}m`);
  }
  return lines;
}

/**
 * Median of a list of numbers without mutating the input: the middle
 * element for odd counts, the mean of the two middle elements for even
 * counts, and 0 for an empty list.
 */
function medianOf(values: readonly number[]): number {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  const upper = sorted[mid] ?? 0;
  if (sorted.length % 2 === 1) return upper;
  const lower = sorted[mid - 1] ?? upper;
  return (lower + upper) / 2;
}

/**
 * Section 5 — interleaved timeline.
 *
 * Pure chronology, no analysis: one line per event returned by
 * `storage.mixedTimeline(since, taskId)`, in the order storage returns them.
 * Each line shows the HH:MM:SS slice of the event's ISO (UTC) timestamp, the
 * event kind, the session id, and the first line of the content cut to 70
 * characters. Observer notes get a magenta NOTE label so they stand out next
 * to agent events — those adjacencies are the coordination failures the
 * numeric sections can't surface.
 *
 * @param ctx Debrief context; `taskId`, when set, is forwarded to storage.
 * @returns Output lines, starting with a blank separator and the heading.
 */
function sectionTimeline(ctx: DebriefContext): string[] {
  const header = ['', kleur.bold('5. Timeline (observer notes interleaved with agent activity)')];
  const entries = ctx.storage.mixedTimeline(ctx.since, ctx.taskId);
  if (entries.length === 0) {
    return [...header, kleur.dim(' No events.')];
  }

  const rows = entries.map((entry) => {
    const clock = new Date(entry.ts).toISOString().slice(11, 19);
    const label =
      entry.kind === 'observer-note' ? kleur.magenta(' NOTE ') : ` ${entry.kind.padEnd(6)}`;
    const author = kleur.dim(entry.session_id.padEnd(10));
    const [firstLine] = entry.content.split('\n');
    const summary = firstLine?.slice(0, 70) ?? '';
    return `${kleur.dim(clock)} ${label} ${author} ${summary}`;
  });
  return [...header, ...rows];
}

/**
 * Registers the `debrief` subcommand on the given commander program.
 *
 * The command opens the storage database under the configured data dir,
 * prints the five debrief sections to stdout for the requested window, then
 * prints a fixed list of reflection prompts. Storage is always closed, even
 * if a section throws.
 *
 * Options:
 * - `--hours <n>`: window size in hours; must be a finite number greater
 *   than zero. Defaults to `DEFAULT_HOURS`.
 * - `--task <id>`: integer task id used to narrow the timeline section.
 *
 * Invalid option values are reported on stderr and set `process.exitCode`
 * to 1 without opening the database. Previously they reached the storage
 * queries as `NaN`.
 *
 * @param program Commander program (or parent command) to attach to.
 */
export function registerDebriefCommand(program: Command): void {
  program
    .command('debrief')
    .description('End-of-day collaboration post-mortem: 5 structured sections over DB evidence.')
    .option('--hours <n>', 'Window size in hours', String(DEFAULT_HOURS))
    .option('--task <id>', 'Narrow the timeline section to a specific task thread')
    .action((opts: { hours: string; task?: string }) => {
      // Validate CLI input up front so a typo never reaches storage as NaN.
      const hours = Number(opts.hours);
      if (!Number.isFinite(hours) || hours <= 0) {
        process.stderr.write(`${kleur.red(`invalid --hours value: ${opts.hours}`)}\n`);
        process.exitCode = 1;
        return;
      }
      const taskId = opts.task ? Number(opts.task) : undefined;
      if (taskId !== undefined && !Number.isInteger(taskId)) {
        process.stderr.write(`${kleur.red(`invalid --task id: ${opts.task}`)}\n`);
        process.exitCode = 1;
        return;
      }

      const settings = loadSettings();
      const dbPath = join(resolveDataDir(settings.dataDir), 'data.db');
      const storage = new Storage(dbPath);
      try {
        const ctx: DebriefContext = {
          storage,
          since: Date.now() - hours * 3_600_000,
          taskId,
        };
        const sections = [
          sectionToolUsage(ctx),
          sectionAutoJoin(ctx),
          sectionProactiveClaims(ctx),
          sectionHandoffs(ctx),
          sectionTimeline(ctx),
        ];
        for (const s of sections) process.stdout.write(`${s.join('\n')}\n`);

        // Hard-coded reflection prompts — the debrief's point is to pick
        // one concrete next thing, not to admire the data.
        process.stdout.write(`\n${kleur.bold('Next-action prompts:')}\n`);
        process.stdout.write(' • Was collaboration meaningfully better than no-hivemind?\n');
        process.stdout.write(
          ' • Which failures were missing-tool vs. tool-not-called vs. structural?\n',
        );
        process.stdout.write(' • What was the most valuable moment the system created?\n');
      } finally {
        storage.close();
      }
    });
}
62 changes: 62 additions & 0 deletions apps/cli/src/commands/note.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { join } from 'node:path';
import { loadSettings, resolveDataDir } from '@cavemem/config';
import { MemoryStore } from '@cavemem/core';
import type { Command } from 'commander';
import kleur from 'kleur';

/**
 * Reserved session identifier for human scratch notes. Every note is written
 * under this one fixed id (rather than a per-invocation random one), so all
 * notes share a single session — which keeps "all my notes" filters and
 * timeline queries trivial.
 */
const OBSERVER_SESSION_ID = 'observer';

/**
 * Makes sure the observer session row exists before a note is written, so
 * the foreign key from observations.session_id holds. Safe to call on every
 * invocation: per the original author's note `startSession` is an
 * `INSERT OR IGNORE`, so repeat calls should be no-ops (not verifiable from
 * this file — confirm against MemoryStore).
 *
 * @param store Open memory store to register the observer session in.
 */
function ensureObserverSession(store: MemoryStore): void {
  // Recorded as the session's working directory: wherever the CLI was run.
  const cwd = process.cwd();
  store.startSession({ id: OBSERVER_SESSION_ID, ide: 'observer', cwd });
}

/**
 * Registers the `note` subcommand on the given commander program.
 *
 * The command joins its variadic words into one note, makes sure the
 * observer session exists, stores the note as an `observer-note` observation
 * under that session, and prints a confirmation with the new observation id
 * and the current UTC time (HH:MM:SS). The store is always closed.
 *
 * Failure cases (message on stderr, `process.exitCode` set to 1, nothing
 * written to the store):
 * - the note text is empty after trimming;
 * - `--task` is given but is not an integer. Previously a non-numeric value
 *   was stored as `task_id: NaN`.
 *
 * @param program Commander program (or parent command) to attach to.
 */
export function registerNoteCommand(program: Command): void {
  program
    // Variadic so `cavemem note codex stepped on claude` works without
    // quoting. The quoting-every-note friction kills adoption otherwise.
    .command('note <text...>')
    .description('Record a timestamped scratch note into the memory timeline')
    .option('--task <id>', 'Attach this note to a specific task thread (shows up in task_timeline)')
    .action(async (words: string[], opts: { task?: string }) => {
      const text = words.join(' ').trim();
      if (!text) {
        process.stderr.write(`${kleur.red('empty note')}\n`);
        process.exitCode = 1;
        return;
      }

      // Reject a malformed task id before opening the store so a typo can
      // never be persisted as NaN.
      const taskId = opts.task ? Number(opts.task) : undefined;
      if (taskId !== undefined && !Number.isInteger(taskId)) {
        process.stderr.write(`${kleur.red(`invalid --task id: ${opts.task}`)}\n`);
        process.exitCode = 1;
        return;
      }

      const settings = loadSettings();
      const dbPath = join(resolveDataDir(settings.dataDir), 'data.db');
      const store = new MemoryStore({ dbPath, settings });
      try {
        ensureObserverSession(store);
        const id = store.addObservation({
          session_id: OBSERVER_SESSION_ID,
          kind: 'observer-note',
          content: text,
          ...(taskId !== undefined ? { task_id: taskId } : {}),
        });
        const when = new Date().toISOString().slice(11, 19);
        const taskSuffix = taskId !== undefined ? ` on task #${taskId}` : '';
        process.stdout.write(`${kleur.green('✓')} note #${id} at ${when}${taskSuffix}\n`);
      } finally {
        store.close();
      }
    });
}
Loading
Loading