Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .changeset/foraging-indexer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
"@colony/foraging": minor
"@colony/storage": minor
"@colony/config": minor
---

Add the foraging indexer and a storage-aware `scanExamples` wrapper.

`indexFoodSource(food, store, opts)` converts a discovered `FoodSource`
into 1–N `foraged-pattern` observations (manifest, README,
entrypoints, filetree), scrubs env-assignment secrets through
`redact`, and persists via `MemoryStore` so compression and the
`<private>` tag stripper both run on the write path.

`scanExamples({ repo_root, store, session_id, limits?, extra_secret_env_names? })`
walks `<repo_root>/examples/*`, compares each discovered source's
`content_hash` against `storage.getExample(...)`, and only re-indexes
when the hash has shifted. Before re-indexing it calls the new
`Storage.deleteForagedObservations(repo_root, example_name)` so the
observation set never duplicates across scans.

Two new helpers on `Storage` let the indexer (and the forthcoming MCP
tool) work without opening the DB directly:

- `deleteForagedObservations(repo_root, example_name): number`
- `listForagedObservations(repo_root, example_name): ObservationRow[]`

New `settings.foraging` block (defaults: enabled, `maxDepth: 2`,
`maxFileBytes: 200_000`, `maxFilesPerSource: 50`,
`scanOnSessionStart: true`, `extraSecretEnvNames: []`). `colony config
show` and `settingsDocs()` pick it up automatically.

No MCP tools, CLI commands, or hook wiring yet — those arrive in the
next PR.
43 changes: 43 additions & 0 deletions packages/config/src/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,49 @@ export const SettingsSchema = z
.record(z.string(), z.boolean())
.default({})
.describe('Installed IDE integrations (set by `colony install`).'),
// Foraging settings: controls the SessionStart scanner/indexer that turns
// <repo_root>/examples into `foraged-pattern` observations.
foraging: z
.object({
enabled: z
.boolean()
.default(true)
.describe('Auto-index <repo_root>/examples food sources on SessionStart.'),
// Capped at 5 so a misconfigured value cannot trigger runaway walks.
maxDepth: z
.number()
.int()
.positive()
.max(5)
.default(2)
.describe('How deep to walk into each example directory.'),
maxFileBytes: z
.number()
.int()
.positive()
.default(200_000)
.describe('Truncate indexed files larger than this.'),
maxFilesPerSource: z
.number()
.int()
.positive()
.default(50)
.describe('Stop indexing after this many files per example.'),
scanOnSessionStart: z
.boolean()
.default(true)
.describe('Fire-and-forget the scanner when SessionStart fires.'),
extraSecretEnvNames: z
.array(z.string())
.default([])
.describe('Additional env-var names to treat as secrets during redaction.'),
})
// NOTE(review): this object-level default duplicates every per-field
// default above — the two must be kept in sync when fields are added.
// Zod applies inner field defaults when parsing `{}`, so `.default({})`
// would likely suffice; confirm before simplifying.
.default({
enabled: true,
maxDepth: 2,
maxFileBytes: 200_000,
maxFilesPerSource: 50,
scanOnSessionStart: true,
extraSecretEnvNames: [],
})
.describe('Foraging: turn <repo_root>/examples into a reusable food source.'),
})
.strict();

Expand Down
5 changes: 5 additions & 0 deletions packages/foraging/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
"test": "vitest run",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@colony/config": "workspace:*",
"@colony/core": "workspace:*",
"@colony/storage": "workspace:*"
},
"devDependencies": {
"tsup": "^8.3.5",
"typescript": "^5.6.3",
Expand Down
6 changes: 4 additions & 2 deletions packages/foraging/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
export { scanExamplesFs } from './scanner.js';
export type { ScanFsOptions, ScanFsResult } from './scanner.js';
export { scanExamples, scanExamplesFs } from './scanner.js';
export type { ScanFsOptions, ScanFsResult, ScanOptions } from './scanner.js';
export { extract, readCapped } from './extractor.js';
export type { ExtractedShape } from './extractor.js';
export { indexFoodSource } from './indexer.js';
export type { IndexFoodSourceOptions } from './indexer.js';
export { redact } from './redact.js';
export type {
ExampleManifestKind,
Expand Down
169 changes: 169 additions & 0 deletions packages/foraging/src/indexer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import { type Stats, readdirSync, statSync } from 'node:fs';
import { join, relative } from 'node:path';
import type { MemoryStore } from '@colony/core';
import { readCapped } from './extractor.js';
import { redact } from './redact.js';
import { DEFAULT_SCAN_LIMITS, type FoodSource, type ForagedPattern } from './types.js';

export interface IndexFoodSourceOptions {
/** Session id that owns the foraged observations (scanner spawns one). */
session_id: string;
/** Per-file read cap; falls back to `DEFAULT_SCAN_LIMITS.max_file_bytes`. */
max_file_bytes?: number;
/** Additional env-var names passed to `redact` as secrets. */
extra_secret_env_names?: readonly string[];
}

/**
 * Persist a discovered food source as `foraged-pattern` observations
 * through `MemoryStore` and report how many were actually written.
 *
 * Every pattern is scrubbed with `redact` before the write; a pattern
 * that redacts down to pure whitespace is dropped. Stale observations
 * for this (repo_root, example_name) are NOT cleared here — that is
 * the caller's responsibility (see `Storage.deleteForagedObservations`),
 * which keeps "first scan" and "re-index" distinguishable in tests.
 */
export function indexFoodSource(
  food: FoodSource,
  store: MemoryStore,
  opts: IndexFoodSourceOptions,
): number {
  const byteCap = opts.max_file_bytes ?? DEFAULT_SCAN_LIMITS.max_file_bytes;
  const secretNames = opts.extra_secret_env_names ?? [];

  let written = 0;
  for (const pattern of buildPatterns(food, byteCap)) {
    const scrubbed = redact(pattern.content, secretNames);
    if (scrubbed.trim() === '') {
      continue; // nothing left after redaction — not worth storing
    }
    const observationId = store.addObservation({
      session_id: opts.session_id,
      kind: 'foraged-pattern',
      content: scrubbed,
      metadata: {
        repo_root: food.repo_root,
        example_name: food.example_name,
        manifest_kind: food.manifest_kind,
        file_path: pattern.file_path,
        entry_kind: pattern.entry_kind,
      },
    });
    if (observationId > 0) {
      written += 1;
    }
  }
  return written;
}

/**
 * Emit patterns in a stable, signal-ranked order: manifest first
 * (highest signal for integration), README next (human prose with
 * usage examples), entrypoints after (canonical call sites), filetree
 * last (tail context).
 */
function buildPatterns(food: FoodSource, maxBytes: number): ForagedPattern[] {
  const out: ForagedPattern[] = [];

  // Read `rel` (relative to the example root, capped at maxBytes) and
  // append it as a pattern; files readCapped reports unreadable (null)
  // are silently skipped.
  const pushFile = (rel: string, entry_kind: ForagedPattern['entry_kind']): void => {
    const content = readCapped(join(food.abs_path, rel), maxBytes);
    if (content === null) return;
    out.push({
      example_name: food.example_name,
      file_path: rel,
      entry_kind,
      content,
    });
  };

  if (food.manifest_path) pushFile(food.manifest_path, 'manifest');
  if (food.readme_path) pushFile(food.readme_path, 'readme');
  for (const ep of food.entrypoints) pushFile(ep, 'entrypoint');

  // The filetree is synthesized (not read from disk), so it bypasses
  // pushFile and uses the sentinel path '__filetree__'.
  const tree = renderFiletree(food.abs_path);
  if (tree) {
    out.push({
      example_name: food.example_name,
      file_path: '__filetree__',
      entry_kind: 'filetree',
      content: tree,
    });
  }

  return out;
}

/**
 * Render a sorted, depth-limited outline of the example directory:
 * one line per entry, directories suffixed with `/` and listed
 * immediately before their contents. Entries in SKIP_NAMES and
 * unreadable paths (permissions, broken symlinks, racy deletes) are
 * skipped. Output is hard-capped at `maxLines` lines so a single flat
 * directory with thousands of files cannot flood the observation
 * store. Human-readable on purpose: agents calling
 * `get_observations(ids[])` on a filetree observation should see
 * something they can reason about.
 *
 * @param abs_path absolute root of the example to outline
 * @param maxDepth deepest directory level to descend into (root = 0)
 * @param maxLines maximum number of output lines
 */
function renderFiletree(abs_path: string, maxDepth = 3, maxLines = 200): string {
  const lines: string[] = [];
  const seenDirs = new Set<string>();

  function visit(dir: string, depth: number): void {
    if (depth > maxDepth) return;
    let entries: string[];
    try {
      entries = readdirSync(dir).sort();
    } catch {
      return; // unreadable directory: skip silently
    }
    for (const name of entries) {
      // Enforce the line cap per entry, not per directory — otherwise a
      // single huge flat directory would blow straight past it.
      if (lines.length >= maxLines) return;
      if (SKIP_NAMES.has(name)) continue;
      const abs = join(dir, name);
      let st: Stats;
      try {
        st = statSync(abs);
      } catch {
        continue; // broken symlink or concurrently removed entry
      }
      const rel = relative(abs_path, abs);
      if (st.isDirectory()) {
        if (!seenDirs.has(rel)) {
          seenDirs.add(rel);
          lines.push(`${rel}/`);
          visit(abs, depth + 1);
        }
      } else if (st.isFile()) {
        lines.push(rel);
      }
    }
  }

  visit(abs_path, 0);
  return lines.join('\n');
}

/** Vendor/build directory names that never carry example signal. */
const SKIP_NAMES = new Set([
  'node_modules',
  '.git',
  '.venv',
  'venv',
  'dist',
  'build',
  'target',
  '.next',
  '.turbo',
  '.cache',
  '__pycache__',
]);
64 changes: 63 additions & 1 deletion packages/foraging/src/scanner.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import { createHash } from 'node:crypto';
import { readdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import type { MemoryStore } from '@colony/core';
import { type ExtractedShape, extract, readCapped } from './extractor.js';
import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits } from './types.js';
import { indexFoodSource } from './indexer.js';
import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits, type ScanResult } from './types.js';

export interface ScanFsOptions {
repo_root: string;
Expand Down Expand Up @@ -94,3 +96,63 @@ function mergeLimits(partial?: Partial<ScanLimits>): ScanLimits {
max_files_per_source: partial?.max_files_per_source ?? DEFAULT_SCAN_LIMITS.max_files_per_source,
};
}

export interface ScanOptions {
/** Repo whose `<repo_root>/examples` directory gets scanned. */
repo_root: string;
/** MemoryStore used for both the storage cache checks and observation writes. */
store: MemoryStore;
/** Session id stamped onto every foraged observation. */
session_id: string;
/** Forwarded to `scanExamplesFs`; `max_file_bytes` also caps indexing reads. */
limits?: Partial<ScanLimits>;
/** Additional env-var names treated as secrets during redaction. */
extra_secret_env_names?: readonly string[];
}

/**
 * Storage-aware scan over `<repo_root>/examples`.
 *
 * For each discovered food source: skip it when the cached
 * `content_hash` on `storage.examples` still matches; otherwise drop
 * the stale foraged observations, re-index through `indexFoodSource`,
 * and upsert the examples row with the fresh hash + observation count.
 *
 * Because the upsert happens last, a partial failure mid-index leaves
 * the row stale and the next run treats the source as changed and
 * retries cleanly — and a second run over an unchanged tree skips
 * everything, so the scan is idempotent by construction.
 */
export function scanExamples(opts: ScanOptions): ScanResult {
  const { store, session_id } = opts;

  const { scanned } = scanExamplesFs({
    repo_root: opts.repo_root,
    ...(opts.limits === undefined ? {} : { limits: opts.limits }),
  });

  let skipped_unchanged = 0;
  let indexed_observations = 0;

  for (const source of scanned) {
    const cached = store.storage.getExample(source.repo_root, source.example_name);
    if (cached && cached.content_hash === source.content_hash) {
      skipped_unchanged += 1;
      continue;
    }

    // Clear stale observations first so the set never duplicates
    // across scans of the same source.
    store.storage.deleteForagedObservations(source.repo_root, source.example_name);

    // Build the indexer options imperatively, adding optional keys only
    // when they are actually defined.
    const indexOpts: Parameters<typeof indexFoodSource>[2] = { session_id };
    const maxFileBytes = opts.limits?.max_file_bytes;
    if (maxFileBytes !== undefined) {
      indexOpts.max_file_bytes = maxFileBytes;
    }
    if (opts.extra_secret_env_names !== undefined) {
      indexOpts.extra_secret_env_names = opts.extra_secret_env_names;
    }

    const written = indexFoodSource(source, store, indexOpts);
    indexed_observations += written;

    // Upsert last: only a fully indexed source gets its hash recorded.
    store.storage.upsertExample({
      repo_root: source.repo_root,
      example_name: source.example_name,
      content_hash: source.content_hash,
      manifest_kind: source.manifest_kind,
      observation_count: written,
    });
  }

  return { scanned, skipped_unchanged, indexed_observations };
}
Loading
Loading