From 111335c616aac3de4fccf2289bcda504793b7683 Mon Sep 17 00:00:00 2001
From: NagyVikt
Date: Fri, 24 Apr 2026 22:04:44 +0200
Subject: [PATCH] feat(cli,storage): add colony backfill ide to heal unknown session rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Session rows whose id matches a known IDE prefix but were written before
MemoryStore.ensureSession learned to infer the owner still sit in storage
with ide='unknown'. The UPSERT in Storage.createSession can auto-upgrade
them on the next observation write, but that never fires for sessions
that ended without a further write — the row stays unclassified and the
worker viewer can never tag the owner.

Add Storage.backfillUnknownIde(mapper) as the one place that scans
ide='unknown' rows and persists whatever the mapper returns, and wire it
into a new `colony backfill ide` subcommand that passes the shared
inferIdeFromSessionId helper as the mapper. The operation is idempotent,
returns {scanned, updated}, and never invents an owner the mapper cannot
classify.

Covered by packages/storage/test/storage.test.ts (new case).
---
 .changeset/backfill-unknown-ide.md    |  6 ++++
 apps/cli/src/commands/backfill.ts     | 40 +++++++++++++++++++++++++++
 apps/cli/src/index.ts                 |  2 ++
 packages/storage/src/storage.ts       | 32 +++++++++++++++++++++
 packages/storage/test/storage.test.ts | 23 +++++++++++++++
 5 files changed, 103 insertions(+)
 create mode 100644 .changeset/backfill-unknown-ide.md
 create mode 100644 apps/cli/src/commands/backfill.ts

diff --git a/.changeset/backfill-unknown-ide.md b/.changeset/backfill-unknown-ide.md
new file mode 100644
index 0000000..eb04849
--- /dev/null
+++ b/.changeset/backfill-unknown-ide.md
@@ -0,0 +1,6 @@
+---
+"@colony/storage": patch
+"@colony/cli": patch
+---
+
+Add a `colony backfill ide` command that heals session rows whose stored `ide` is `'unknown'` by re-running the shared `inferIdeFromSessionId` helper against the row's session id. This is intended as a one-shot clean-up for databases populated before the hook-side inference learned to handle hyphen-delimited (`codex-...`) and Guardex-branch (`agent/codex/...`) session ids. The underlying `Storage.backfillUnknownIde(mapper)` is idempotent, returns `{ scanned, updated }`, and skips any row the mapper cannot classify so it never invents an owner.
diff --git a/apps/cli/src/commands/backfill.ts b/apps/cli/src/commands/backfill.ts
new file mode 100644
index 0000000..7b89dbb
--- /dev/null
+++ b/apps/cli/src/commands/backfill.ts
@@ -0,0 +1,40 @@
+import { join } from 'node:path';
+import { loadSettings, resolveDataDir } from '@colony/config';
+import { inferIdeFromSessionId } from '@colony/core';
+import { Storage } from '@colony/storage';
+import type { Command } from 'commander';
+
+/**
+ * Registers `colony backfill ide`, which heals session rows whose stored
+ * ide is `'unknown'` by re-running the same session-id prefix inference
+ * that the hooks now apply on write. It exists because ensureSession used
+ * to hardcode `ide = 'unknown'` for on-demand-materialised rows, which left
+ * a long trail of orphan `codex-*` and `agent/codex/*` sessions in the
+ * viewer.
+ *
+ * The command is idempotent: rows that already have a known ide are
+ * skipped, and re-running it only writes rows the inferrer can actually
+ * classify. No-op when every row already has a concrete owner.
+ */
+export function registerBackfillCommand(program: Command): void {
+  const backfill = program.command('backfill');
+  backfill.description('Heal historical rows that predate newer inference logic.');
+
+  const ideCommand = backfill.command('ide');
+  ideCommand.description('Re-infer the ide column for sessions stored as unknown.');
+  ideCommand.action(async () => {
+    const { dataDir } = loadSettings();
+    const dbPath = join(resolveDataDir(dataDir), 'data.db');
+    const storage = new Storage(dbPath);
+    try {
+      const inferOwner = (sessionId: string) => inferIdeFromSessionId(sessionId);
+      const { scanned, updated } = storage.backfillUnknownIde(inferOwner);
+      // scanned - updated = unknown rows this run did not rewrite.
+      process.stdout.write(
+        `backfill ide: scanned=${scanned} updated=${updated} remaining=${scanned - updated}\n`,
+      );
+    } finally {
+      storage.close();
+    }
+  });
+}
diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts
index 4ff02de..c1f19cc 100644
--- a/apps/cli/src/index.ts
+++ b/apps/cli/src/index.ts
@@ -2,6 +2,7 @@
 import { realpathSync } from 'node:fs';
 import { pathToFileURL } from 'node:url';
 import { Command } from 'commander';
+import { registerBackfillCommand } from './commands/backfill.js';
 import { registerCompressCommands } from './commands/compress.js';
 import { registerConfigCommand } from './commands/config.js';
 import { registerDebriefCommand } from './commands/debrief.js';
@@ -42,6 +43,7 @@ export function createProgram(): Command {
   registerExportCommand(program);
   registerHookCommand(program);
   registerReindexCommand(program);
+  registerBackfillCommand(program);
   registerNoteCommand(program);
   registerObserveCommand(program);
   registerDebriefCommand(program);
diff --git a/packages/storage/src/storage.ts b/packages/storage/src/storage.ts
index 1b9194f..995096f 100644
--- a/packages/storage/src/storage.ts
+++ b/packages/storage/src/storage.ts
@@ -102,6 +102,38 @@ export class Storage {
       .all(limit) as SessionRow[];
   }
 
+  /**
+   * Re-classify session rows whose ide is still `'unknown'`. Each such row's
+   * id is handed to `mapper`; a non-empty result other than `'unknown'` is
+   * stored as the row's ide, anything else leaves the row untouched. Used by
+   * `colony backfill ide` to heal rows written before the on-demand
+   * `MemoryStore.ensureSession` learned to infer the owner from the session
+   * id. Returns `{ scanned, updated }`: unknown rows seen and rows rewritten.
+   */
+  backfillUnknownIde(
+    mapper: (sessionId: string) => string | undefined,
+  ): { scanned: number; updated: number } {
+    const unknownRows = this.db
+      .prepare("SELECT id FROM sessions WHERE ide = 'unknown'")
+      .all() as Array<{ id: string }>;
+    const setIde = this.db.prepare('UPDATE sessions SET ide = ? WHERE id = ? AND ide = ?');
+    // Ask the mapper about every row first; only classified rows are written.
+    const reclassified: Array<{ id: string; ide: string }> = [];
+    for (const { id } of unknownRows) {
+      const ide = mapper(id);
+      if (!ide || ide === 'unknown') continue;
+      reclassified.push({ id, ide });
+    }
+    let updated = 0;
+    const applyAll = this.db.transaction((batch: Array<{ id: string; ide: string }>) => {
+      for (const { id, ide } of batch) {
+        if (setIde.run(ide, id, 'unknown').changes > 0) updated += 1;
+      }
+    });
+    applyAll(reclassified);
+    return { scanned: unknownRows.length, updated };
+  }
+
   // --- observations ---
 
   insertObservation(o: NewObservation): number {
diff --git a/packages/storage/test/storage.test.ts b/packages/storage/test/storage.test.ts
index 1a078b9..2e8970d 100644
--- a/packages/storage/test/storage.test.ts
+++ b/packages/storage/test/storage.test.ts
@@ -275,4 +275,27 @@ describe('Storage', () => {
     expect(storage.countEmbeddings({ model: 'm', dim: 1 })).toBe(1);
     expect(storage.countEmbeddings({ model: 'm', dim: 2 })).toBe(0);
   });
+
+  it('backfillUnknownIde only rewrites rows the mapper can classify', () => {
+    const seeds = [
+      ['codex-foo', 'unknown'],
+      ['agent/codex/bar', 'unknown'],
+      ['mystery-slug', 'unknown'],
+      ['known-session', 'claude-code'],
+    ] as const;
+    seeds.forEach(([id, ide], index) => {
+      storage.createSession({ id, ide, cwd: null, started_at: index + 1, metadata: null });
+    });
+
+    const mapper = (id: string): string | undefined =>
+      id.startsWith('codex-') || id.startsWith('agent/codex/') ? 'codex' : undefined;
+    expect(storage.backfillUnknownIde(mapper)).toEqual({ scanned: 3, updated: 2 });
+    expect(storage.getSession('codex-foo')?.ide).toBe('codex');
+    expect(storage.getSession('agent/codex/bar')?.ide).toBe('codex');
+    expect(storage.getSession('mystery-slug')?.ide).toBe('unknown');
+    expect(storage.getSession('known-session')?.ide).toBe('claude-code');
+
+    // Idempotent: a second run sees only the unclassifiable row and writes nothing.
+    expect(storage.backfillUnknownIde(mapper)).toEqual({ scanned: 1, updated: 0 });
+  });
 });