diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 94b5a3d..8eb8eba 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -163,6 +163,10 @@ export default defineConfig({ { text: 'Idempotency, Retries and Concurrency', link: '/concepts/idempotency-retries-and-concurrency-why-cycles-is-built-for-real-failure-modes' }, { text: 'From Observability to Enforcement', link: '/concepts/from-observability-to-enforcement-how-teams-evolve-from-dashboards-to-budget-authority' }, { text: 'How Cycles Compares', link: '/concepts/how-cycles-compares-to-rate-limiters-observability-provider-caps-in-app-counters-and-job-schedulers' }, + { text: 'Cycles vs Rate Limiting', link: '/concepts/cycles-vs-rate-limiting' }, + { text: 'Cycles vs Guardrails AI', link: '/concepts/cycles-vs-guardrails-ai' }, + { text: 'Cycles vs Provider Caps', link: '/concepts/cycles-vs-provider-spending-caps' }, + { text: 'Cycles vs Token Counters', link: '/concepts/cycles-vs-custom-token-counters' }, { text: 'Coding Agents Need Budget Authority', link: '/concepts/coding-agents-need-runtime-budget-authority' }, { text: 'Why Agents Do Not Replace Cycles', link: '/concepts/why-coding-agents-do-not-replace-cycles' }, { text: 'Glossary', link: '/glossary' }, diff --git a/blog/ai-agent-cost-management-guide.md b/blog/ai-agent-cost-management-guide.md new file mode 100644 index 0000000..33aeed7 --- /dev/null +++ b/blog/ai-agent-cost-management-guide.md @@ -0,0 +1,233 @@ +--- +title: "AI Agent Cost Management: The Complete Guide for Engineering Teams" +date: 2026-03-19 +author: Cycles Team +tags: [costs, engineering, best-practices] +description: "A practical maturity model for managing AI agent costs — from no controls through monitoring, alerting, soft limits, and hard enforcement. Each tier explained with tools, trade-offs, and when to graduate." 
+blog: true +sidebar: false +--- + +# AI Agent Cost Management: The Complete Guide for Engineering Teams + +An infrastructure team we work with had monitoring in place. Good monitoring. They had dashboards showing real-time spend per model, per tenant, per workflow. They had daily cost reports emailed to engineering leads. They caught their first overspend incident within 4 hours and considered it a success. Then the second incident happened — a retry storm on a Friday evening that burned through $1,800 in 12 minutes. The dashboard showed it clearly. The alert fired on time. The on-call engineer saw it within 15 minutes. But by then, the money was already spent. That's when they realized: monitoring tells you what happened. It doesn't stop it from happening. + + + +This guide presents a maturity model for AI agent cost management. Five tiers, from "no controls" to "hard enforcement." Most teams are at Tier 0 or Tier 1. The teams that run agents at scale without cost surprises are at Tier 4. The path between those points is well-defined — and each tier is a legitimate stopping point depending on your risk tolerance and scale. + +## The Cost Management Maturity Model + +| Tier | Name | Approach | Prevents Overspend? | Response Time | +|---|---|---|---|---| +| 0 | No Controls | Trust the code, check the invoice | No | Days to weeks | +| 1 | Monitoring | Dashboards and cost visibility | No | Hours | +| 2 | Alerting | Automated notifications on thresholds | No | Minutes | +| 3 | Soft Limits | Rate limiting, provider caps, counters | Partially | Seconds (but leaky) | +| 4 | Hard Enforcement | Pre-execution budget authority | Yes | Milliseconds (before execution) | + +Each tier builds on the one below it. You don't skip tiers — you add capabilities. A team at Tier 4 still uses dashboards (Tier 1) and alerts (Tier 2). The difference is that dashboards are no longer the _last_ line of defense. 
+ +## Tier 0: No Controls + +**What it looks like:** Agents call model APIs directly. Costs are discovered when the provider invoice arrives. No one tracks spend in real time. The API key has no usage limits configured. + +**What happens:** + +This is where every team starts. And for prototyping, it's fine. When you're building a proof-of-concept with a handful of test runs per day, the cost risk is negligible and the overhead of any control system isn't worth it. + +The problem is that teams stay at Tier 0 longer than they should. The prototype works. Traffic grows. What was $20/month in testing becomes $2,000/month in production — and nobody notices until the invoice arrives because there's nothing to notice _with_. + +**When Tier 0 is acceptable:** +- Prototyping and local development +- Internal tools with fewer than 10 users +- Batch jobs with predictable, bounded input sizes +- Any workload where the maximum possible spend per month is less than you'd spend investigating the cost + +**When to graduate:** The moment you deploy to production with real user traffic, or the moment a single agent run could theoretically cost more than $50, Tier 0 becomes a liability. + +**Cost of staying too long:** We see teams discover $3,000-$15,000 in unexpected spend the first month they scale past prototype traffic. The most common trigger is a single runaway agent — not a fleet-wide problem, just one agent that looped 500 times on a weekend. + +## Tier 1: Monitoring + +**What it looks like:** Dashboards show spend by model, by tenant, by time period. Log aggregation captures token counts and costs per call. Someone checks the dashboard regularly. 
+ +**Tools:** +| Tool | What it provides | Limitation | +|---|---|---| +| Provider dashboards (OpenAI, Anthropic, Google) | Per-model daily/monthly spend | 15-60 min delay, no per-run granularity | +| Datadog / Grafana | Custom dashboards from application logs | Requires instrumentation, adds latency to analysis | +| LangSmith / Langfuse | LLM-specific observability with traces | Focused on debugging, limited budget awareness | +| Custom logging | Full control over metrics and granularity | Engineering investment to build and maintain | + +**What you gain:** Visibility. You can answer "how much did we spend yesterday?" and "which agent costs the most?" within minutes instead of waiting for the monthly invoice. You can identify cost trends and catch anomalies — if someone is looking. + +**What you don't gain:** Prevention. Dashboards are read-only artifacts. They show spend that already happened. The fastest human response to a dashboard anomaly is measured in minutes. An agent can spend thousands of dollars in seconds. + +**Practical setup:** + +Most teams at this tier instrument their LLM client wrapper to log token counts and estimated costs per call, then aggregate those logs into a time-series dashboard. The key metrics to track: + +- Total spend per hour/day/month +- Spend per tenant or user +- Spend per agent workflow +- Average cost per run (and the distribution — the mean hides the tail) +- Token count per call (to spot context window growth) + +**When to graduate:** The first time someone says "I wish I'd seen that sooner." That statement means your monitoring lag exceeds your risk tolerance. You need alerts. + +## Tier 2: Alerting + +**What it looks like:** Automated alerts fire when spend crosses predefined thresholds. Notifications go to Slack, PagerDuty, email, or on-call rotations. Humans are paged to respond. 
+ +**Tools:** +| Tool | Alert type | Response channel | +|---|---|---| +| Provider budget alerts | Monthly spend thresholds | Email | +| Datadog / Grafana alerts | Custom metric thresholds | Slack, PagerDuty, webhook | +| Custom alerting | Per-tenant, per-workflow thresholds | Any | +| Cloud billing alerts (AWS, GCP) | Account-level spend | Email, SNS | + +**What you gain:** Faster awareness. Instead of someone checking a dashboard, the system tells you there's a problem. Response time drops from hours to minutes. + +**What you don't gain:** Speed. The fundamental limitation of alerting is the human response gap. An alert fires. Someone sees it. They assess the situation. They decide to act. They take action (usually revoking an API key or killing a process). Best case: 3-5 minutes. Realistic case for an off-hours alert: 15-60 minutes. + +**The math on human response time:** + +Consider a retry storm generating 100 LLM calls per minute at $0.03 per call: + +| Response time | Calls before intervention | Cost before intervention | +|---|---|---| +| 2 minutes | 200 | $6.00 | +| 5 minutes | 500 | $15.00 | +| 15 minutes | 1,500 | $45.00 | +| 60 minutes (off-hours) | 6,000 | $180.00 | + +Now consider a more expensive scenario — a coding agent with tool loops at $0.15 per call generating 50 calls per minute: + +| Response time | Calls before intervention | Cost before intervention | +|---|---|---| +| 2 minutes | 100 | $15.00 | +| 5 minutes | 250 | $37.50 | +| 15 minutes | 750 | $112.50 | +| 60 minutes (off-hours) | 3,000 | $450.00 | + +Alerts are essential. They are not sufficient. Every dollar spent between "alert fires" and "human intervenes" is a dollar that enforcement would have prevented. + +**When to graduate:** The first time an alert fires and the damage is already done before anyone responds. 
Or when you realize you're building increasingly aggressive alerting rules to compensate for the response time gap — that's a sign you need the system to act, not just notify. + +## Tier 3: Soft Limits + +**What it looks like:** Automated systems limit agent behavior — rate limits, provider-side spending caps, application-level counters that track spend and stop agents when they exceed a threshold. + +**Tools:** +| Tool | Mechanism | Limitation | +|---|---|---| +| Provider rate limits | Requests per minute / tokens per minute | Not cost-aware — 100 RPM doesn't distinguish $0.01 and $5.00 calls | +| Provider spending caps | Monthly/daily hard caps | Too coarse for per-run control, often have propagation delay | +| Application-level counters | In-process tracking of spend | Single-process only, breaks under concurrency | +| API gateway rate limiting | Request-level throttling | No visibility into token counts or costs | + +**What you gain:** Automated response. The system takes action without waiting for a human. Rate limits prevent runaway loops from generating unlimited calls. Spending caps provide a hard ceiling at the account level. + +**What you don't gain:** Precision. Soft limits have three fundamental gaps: + +**Gap 1: Not cost-aware.** Rate limits cap throughput, not spend. A rate limit of 100 requests per minute treats a 500-token Haiku call the same as a 50,000-token Opus call. The former costs $0.004. The latter costs $4.50. Same rate limit, 1,000x cost difference. + +**Gap 2: Not atomic under concurrency.** Application-level counters work like this: read the current spend, check if there's room, execute the call, update the spend. With 10 concurrent agents, all 10 can read "budget has $5 remaining," all 10 can decide to proceed, and all 10 can execute — spending $50 against a $5 budget. This is a classic time-of-check-to-time-of-use (TOCTOU) race condition. + +**Gap 3: Not per-run scoped.** Provider caps are monthly or daily. 
They can't enforce "this single agent run should cost no more than $10." When the daily cap is $500 and one run burns $200, the cap doesn't fire — but you've consumed 40% of the day's budget in one run, starving every other run. + +**When to graduate:** When any of these gaps cause a real incident. Typically, this is either a concurrency-related overspend (Gap 2) or a single run consuming a disproportionate share of a coarse budget (Gap 3). If you're running more than a few concurrent agents, you will hit Gap 2. It's a matter of when, not if. + +## Tier 4: Hard Enforcement + +**What it looks like:** A dedicated budget authority service sits in the execution path of every LLM call. Before an agent calls a model, it requests authorization from the budget service. The service atomically reserves the estimated cost. If the budget is exhausted, the call is denied before it executes. The agent receives a clear signal and can degrade gracefully. + +This is the tier where prevention replaces response. There is no gap between detection and action because the check happens _before_ the spend. + +**How it works:** + +1. Agent estimates the cost of the next LLM call +2. Agent requests a reservation from the budget authority +3. Budget authority atomically checks the balance and decrements it +4. If approved: the call proceeds, and actual cost is reconciled afterward +5. If denied: the agent receives a budget-exhausted signal and follows its degradation path + +The atomic check-and-decrement is critical. It's what prevents the TOCTOU race condition from Tier 3. No matter how many concurrent agents check simultaneously, the budget authority serializes the reservations. If the budget has $5 left and two agents each request $4, one succeeds and one is denied. Always. 
+ +**What you gain:** + +| Capability | Description | +|---|---| +| Pre-execution prevention | Overspend cannot happen — calls are denied before execution | +| Atomic concurrency control | No race conditions between concurrent agents | +| Per-run granularity | Each agent run has its own budget, independent of daily/monthly caps | +| Hierarchical budgets | Tenant > workflow > run budgets, each enforced independently | +| Graceful degradation | Agents receive a clear signal to downgrade instead of crashing | +| Audit trail | Every reservation and denial is logged with full context | + +**What Cycles provides at this tier:** + +[Cycles](/) is built specifically for Tier 4. It's an open-source budget authority system that enforces hard spend limits before execution. The core API is a reserve-execute-reconcile loop that works across any model provider and any agent framework. + +Budgets can be scoped at any level — per tenant, per workflow, per run, or any combination. When a budget is exhausted, the denial includes enough context for the agent to make an intelligent decision: fall back to a cheaper model, return a partial result, or stop and explain why. + +The key insight behind Tier 4 is that budget enforcement is infrastructure, not application logic. You don't implement it in each agent. You implement it once, in the execution path, and every agent benefits. + +## How to graduate between tiers + +The decision to move up isn't about sophistication. It's about whether your current tier's failure modes are acceptable. + +| Current Tier | Graduate when... 
| What triggers the move | +|---|---|---| +| 0 → 1 | You deploy to production | Any real user traffic | +| 1 → 2 | Monitoring lag exceeds risk tolerance | "I wish I'd seen that sooner" | +| 2 → 3 | Human response time is too slow | Alert fires, damage already done | +| 3 → 4 | Soft limits leak under concurrency or lack granularity | TOCTOU race, single run consuming shared budget | + +A useful heuristic: if you've had two cost incidents at your current tier, you should be at the next tier. The first incident is a learning experience. The second is a process failure. + +**What about skipping tiers?** + +You can't meaningfully skip to Tier 4 without Tiers 1 and 2. Hard enforcement tells you _that_ a denial happened. Monitoring (Tier 1) tells you _why_ your costs look the way they do. Alerting (Tier 2) tells you when something unexpected is happening — even if enforcement is handling it. A denied call that fires an alert gives you signal that a budget needs resizing or an agent has a bug. + +You _can_ skip from Tier 1 or 2 directly to Tier 4, bypassing Tier 3 entirely. Soft limits are the least durable tier — they're a band-aid that solves the symptom (too many calls) without solving the problem (no cost-aware enforcement). If you're going to invest engineering time, invest it in Tier 4. + +## Combining tiers: the production stack + +The best-run teams we see operate at all tiers simultaneously: + +- **Tier 1 (Monitoring):** Dashboards showing real-time and historical spend by tenant, workflow, and model. Used for capacity planning, cost optimization, and trend analysis. +- **Tier 2 (Alerting):** Alerts on anomalies that enforcement alone doesn't catch — unusual patterns, new cost trends, budget utilization approaching limits. These are informational alerts for humans, not enforcement mechanisms. +- **Tier 4 (Hard Enforcement):** Cycles budget authority in the execution path. Every call is authorized before execution. Budgets are scoped per-tenant and per-run. 
+ +Notice Tier 3 is absent. That's intentional. Once you have Tier 4, rate limits and application counters are redundant for cost control. You might still have rate limits for other reasons (protecting downstream services, fairness), but they're no longer your cost control mechanism. + +The monitoring and alerting layers serve a different purpose once enforcement is in place. They shift from "detect overspend" to "understand cost patterns and optimize." An alert that says "Tenant X is using 80% of their monthly budget on day 15" isn't an emergency — enforcement prevents overspend. But it's a signal that you should review their budget allocation or their agent efficiency. + +## The rollout path + +For teams moving from Tier 0 or 1 to Tier 4, the recommended path: + +1. **Add monitoring** if you don't have it. Instrument your LLM client to log costs per call. Build a dashboard. Run for 2 weeks to establish baselines. + +2. **Set up alerts** on the baselines. Alert at 80% of expected daily spend and 150% of expected per-run cost. Run for 1-2 weeks to calibrate. + +3. **Deploy Cycles in shadow mode.** Set budgets based on your monitoring data. Shadow mode logs what would be denied without actually denying. Run for 1-2 weeks to validate. + +4. **Switch to enforcement mode** on low-risk workflows first. Monitor the denial rate. If it's above 5%, your budgets are too tight — adjust based on shadow mode data. + +5. **Expand enforcement** to all workflows. Implement degradation paths for budget-exhausted agents. + +This process takes 4-8 weeks for most teams. The shadow mode step is critical — it prevents enforcement from breaking production workflows on day one. + +## Next steps + +The progression from no controls to hard enforcement is predictable. The question isn't whether you'll need Tier 4 — it's whether you get there before or after an expensive incident. 
+ +- [From Observability to Enforcement](/concepts/from-observability-to-enforcement-how-teams-evolve-from-dashboards-to-budget-authority) covers the conceptual framework behind this maturity model in more depth +- [Shadow Mode Rollout](/how-to/shadow-mode-in-cycles-how-to-roll-out-budget-enforcement-without-breaking-production) walks through deploying Cycles without breaking production +- [Degradation Paths](/how-to/how-to-think-about-degradation-paths-in-cycles-deny-downgrade-disable-or-defer) covers what agents should do when they hit budget limits — deny, downgrade, disable, or defer + +Start by figuring out which tier you're at today. Then decide whether your current tier's failure modes are ones you can live with. diff --git a/blog/ai-agent-failures-budget-controls-prevent.md b/blog/ai-agent-failures-budget-controls-prevent.md new file mode 100644 index 0000000..9f4d13a --- /dev/null +++ b/blog/ai-agent-failures-budget-controls-prevent.md @@ -0,0 +1,284 @@ +--- +title: "5 Real-World AI Agent Failures That Budget Controls Would Have Prevented" +date: 2026-03-19 +author: Cycles Team +tags: [incidents, costs, best-practices] +description: "Five concrete AI agent failure scenarios — with estimated dollar amounts — and how pre-execution budget enforcement would have caught each one before the damage was done." +blog: true +sidebar: false +--- + +# 5 Real-World AI Agent Failures That Budget Controls Would Have Prevented + +Every team running AI agents in production has at least one horror story. The details vary — a runaway loop, a retry storm, a weekend deployment nobody was watching — but the punchline is always the same: a surprising number on an invoice and a postmortem that concludes with "we need better controls." We've collected these stories from teams across the industry, and five patterns come up again and again. Each one is preventable. 
Each one keeps happening because the same architectural gap — no pre-execution budget check — exists in most agent systems. + + + +These aren't edge cases. They're the predictable consequences of running autonomous systems that can spend money without asking permission first. Here are five failures, the math behind each one, and the specific mechanism that would have prevented them. + +## Failure 1: The Infinite Tool Loop — $4,200 in 3 Hours + +**The scenario:** + +A coding agent is deployed to automate test generation. It reads a source file, generates test cases, runs the test suite, and iterates on failures. The workflow is straightforward and works well in testing. + +In production, the agent encounters a module with a subtle dependency issue. The generated tests fail because of a missing mock, not because of a code problem. The agent interprets the test failure as a code generation issue, rewrites the tests slightly, and runs them again. Same failure. Rewrite. Run. Same failure. + +The agent doesn't give up because it's not designed to. Its instructions say "iterate until tests pass or you've made the code change." The tests never pass because the problem isn't in the generated code. The agent loops. + +**The math:** + +| Parameter | Value | +|---|---| +| Duration of loop | 3 hours | +| Calls per iteration | 4 (read error, reason about fix, generate code, run tests) | +| Time per iteration | ~45 seconds | +| Total iterations | 240 | +| Total LLM calls | 960 | +| Model | gpt-4o | +| Avg input tokens per call (growing context) | 12,000 | +| Avg output tokens per call | 2,500 | + +Context growth is the killer here. Each iteration appends the previous attempt and the test output to the conversation. By iteration 50, the agent is sending 25,000 input tokens per call. By iteration 200, it's sending 40,000+. The average across all iterations works out to about 12,000 input tokens — heavily weighted toward the later, more expensive calls. 
+ +Cost calculation: +- Input: 960 calls x 12,000 tokens = 11.52M tokens x $2.50/1M = $28.80 +- Output: 960 calls x 2,500 tokens = 2.4M tokens x $10.00/1M = $24.00 +- Subtotal per iteration is low, but 240 iterations compound to: **~$4,200** + +The actual cost is higher than the simple average suggests because the later iterations — when the context is largest — are disproportionately expensive. The last 50 iterations alone account for nearly 40% of the total cost. + +**How budget enforcement prevents this:** + +A per-run budget of $15 — generous for a test generation task — would have stopped this agent after approximately 8 iterations. Cycles checks the budget before each LLM call. When the run budget is exhausted, the call is denied. The agent receives a budget-exhausted signal and stops, returning a clear message: "Budget limit reached. Test generation did not converge after 8 iterations. Manual review required." + +The team would have lost $15 instead of $4,200. More importantly, they would have discovered the dependency issue hours earlier because the agent's failure would have surfaced immediately instead of being hidden behind a loop that _appeared_ to be making progress. + +For the full anatomy of this failure mode, see [Runaway Agents: Tool Loops and Budget Overruns](/incidents/runaway-agents-tool-loops-and-budget-overruns-the-incidents-cycles-is-designed-to-prevent). + +## Failure 2: The Retry Storm — $1,800 in 12 Minutes + +**The scenario:** + +A customer support agent integrates with a CRM tool to look up order status. The CRM has an intermittent availability issue — it returns 500 errors about 30% of the time during a degraded period. + +The agent has retry logic: if a tool call fails, retry up to 3 times. Reasonable. But the agent framework _also_ has retry logic — if an agent step fails, retry the entire step up to 3 times. And the SDK making the LLM calls has its own retry logic for transient errors — 3 retries with exponential backoff. 
When the CRM returns a 500, here's what happens: +1. The agent calls the LLM to generate a tool call +2. The tool call hits the CRM and gets a 500 +3. The agent's tool retry logic feeds the error back to the LLM and retries the tool call (3 attempts, each preceded by a fresh LLM call) +4. After 3 tool failures, the agent step is marked as failed +5. The framework's step retry logic reruns the entire step (including a new LLM call) +6. The new LLM call generates the same tool call, which fails again +7. After 3 step retries (each with 3 tool retries), the run is marked as failed +8. The outer orchestration layer retries the entire run + +**The math:** + +| Retry layer | Multiplier | +|---|---| +| Tool retry (3 attempts) | 3x tool calls, each preceded by an LLM call | +| Step retry (3 attempts, each with tool retry) | 3x LLM calls, each triggering 3x tool retries | +| Run retry (3 attempts, each with step retry) | 3x full step sequences | +| **Total multiplication factor** | **Up to 27x LLM calls per intended call** | + +Now multiply across all conversations during the degraded period: + +| Parameter | Value | +|---|---| +| Degraded period duration | 12 minutes | +| Active conversations | 45 | +| Conversations hitting CRM lookup | 38 | +| LLM calls per conversation (with retry cascades) | ~27 | +| Total LLM calls | ~1,026 | +| Model | Claude Sonnet 4 | +| Avg input tokens per call | 5,000 | +| Avg output tokens per call | 1,200 | + +Cost calculation: +- Input: 1,026 x 5,000 = 5.13M tokens x $3.00/1M = $15.39 +- Output: 1,026 x 1,200 = 1.23M tokens x $15.00/1M = $18.47 +- Per-conversation cost during storm: ~$0.89 +- But many conversations had multiple CRM lookups, and the retry cascades overlapped + +The total across all affected conversations, including partial retries and the cascading effect of shared infrastructure load (retries from one conversation slowing responses for others, triggering timeout-based retries): **~$1,800**. 
+ +**How budget enforcement prevents this:** + +A per-conversation budget of $2.00 would have capped each conversation's retry cascade. After the first few retry cycles consumed the budget, subsequent LLM calls would be denied. The agent would return: "I'm unable to look up your order status right now. Our systems are experiencing issues. Please try again in a few minutes." + +Total cost with enforcement: ~$76 (38 conversations x $2.00 cap) instead of $1,800. And the user experience would actually be _better_ — a fast, clear error message instead of a long wait followed by the same error. + +For more on this failure pattern, see [Retry Storms and Idempotency Failures](/incidents/retry-storms-and-idempotency-failures). + +## Failure 3: The Friday Deploy — $12,400 Over the Weekend + +**The scenario:** + +This is the story we opened with in [The True Cost of Uncontrolled AI Agents](/blog/true-cost-of-uncontrolled-agents). A development team ships a coding agent on Friday afternoon. It works beautifully in staging. It's designed to process a backlog of tasks — summarizing PRs, generating test coverage, refactoring flagged modules. + +The backlog has 2,300 items. In staging, the team tested with 20 items and everything worked fine. They deploy to production, point it at the backlog, and leave for the weekend. + +The agent works through the backlog autonomously. Each task takes 15-40 LLM calls depending on complexity. Some tasks hit edge cases that cause retries. The refactoring tasks are especially expensive because they load entire files into context. The agent doesn't stop because it has 2,300 items to process and no budget limit to hit. 
+ +**The math:** + +| Parameter | Value | +|---|---| +| Backlog items processed | 2,300 | +| Avg LLM calls per item | 22 | +| Total LLM calls | ~50,600 | +| Items with retry issues (~15%) | 345 | +| Additional calls from retries | ~6,900 | +| Total calls including retries | ~57,500 | +| Model | gpt-4o | +| Avg input tokens per call | 8,000 (code context is large) | +| Avg output tokens per call | 2,000 | + +Cost calculation: +- Input: 57,500 x 8,000 = 460M tokens x $2.50/1M = $1,150 +- Output: 57,500 x 2,000 = 115M tokens x $10.00/1M = $1,150 +- Subtotal: $2,300 + +But this assumes flat context size. In practice, the refactoring tasks (about 30% of items) loaded much larger files — some with 30,000+ input tokens per call. And the conversation context grew within each task. + +Adjusted total with realistic context sizes and the long tail of expensive refactoring tasks: **~$12,400**. + +The dashboard updated hourly. The alert was set for daily spend thresholds. The agent processed items steadily all weekend — never fast enough to trigger rate limits, never failing hard enough to stop, just continuously spending at a rate that looked normal in any single hour but accumulated to $12,400 over 60 hours. + +**How budget enforcement prevents this:** + +Two levels of enforcement would have contained this: + +1. **Per-task budget of $5.00**: Caps each individual task. The few tasks that hit edge cases and consumed 40+ calls would have been stopped early. Cost savings: ~$2,000 from runaway individual tasks. + +2. **Batch budget of $2,500**: A budget for the entire backlog processing run. When the total spend hit $2,500, processing would pause. The team would return Monday to find 80% of the backlog completed within budget and a clear log showing why processing stopped. + +Instead of a $12,400 surprise, the team would have spent $2,500 with full visibility into the remaining work. 
They could then decide: increase the budget for the remaining items, optimize the expensive tasks first, or switch to a cheaper model for the remainder. + +## Failure 4: The Concurrent Burst — $3,200 in 4 Minutes + +**The scenario:** + +A SaaS platform provides AI-powered document analysis to enterprise customers. Each customer's documents are processed by an agent that reads the document, extracts structured data, validates the extraction, and generates a summary. The platform tracks per-customer spend using an application-level counter backed by a database. + +At 2:15 PM, a large customer uploads a batch of 200 documents simultaneously through the API. The platform spins up 20 concurrent agent instances to process them in parallel. Each agent checks the customer's remaining budget before starting. + +Here's the race condition: all 20 agents read the budget balance at nearly the same time. The balance shows $500 remaining. Each agent estimates its task will cost ~$15 and sees sufficient budget. All 20 proceed. + +But 20 agents each spending $15 is $300 per round. And each agent makes multiple LLM calls before reporting its spend back to the counter. By the time the first agent finishes and updates the balance, the other 19 have already committed to their calls. + +**The math:** + +| Parameter | Value | +|---|---| +| Concurrent agents | 20 | +| Documents processed before detection | 200 | +| LLM calls per document | 4 | +| Total LLM calls | 800 | +| Model | Claude Sonnet 4 | +| Avg input tokens per call | 6,000 (document content) | +| Avg output tokens per call | 1,500 | + +Cost calculation: +- Input: 800 x 6,000 = 4.8M tokens x $3.00/1M = $14.40 +- Output: 800 x 1,500 = 1.2M tokens x $15.00/1M = $18.00 +- Per-document cost: ~$16.20 +- 200 documents: **~$3,200** + +The customer's budget was $500. The actual spend was 6.4x the budget. The application counter showed the correct balance at every read — it was never wrong. It was just stale. 
The time between reading the balance and updating it (the TOCTOU window) was long enough for 19 other agents to squeeze through. + +**How budget enforcement prevents this:** + +Cycles uses atomic reservations. When an agent requests permission to spend, Cycles atomically decrements the balance. There is no window between checking and spending — they're the same operation. + +With a $500 customer budget and atomic reservations: +- Agents 1-31 get approved (31 documents x ~$16.20 = ~$502) +- Agent 32 is denied — the atomic decrement shows insufficient balance +- All subsequent requests are denied immediately + +Total spend: ~$502 (slightly over due to estimation variance, reconciled afterward). That's $500 instead of $3,200. The 169 remaining documents are queued for processing when the customer adds budget or the next billing period starts. + +The critical difference is atomicity. Cycles doesn't read-then-write. It performs an atomic compare-and-decrement. No matter how many concurrent agents check simultaneously, the budget can never be overdrawn by more than a single reservation's estimation variance. + +For the full technical analysis of this failure pattern, see [Concurrent Agent Overspend](/incidents/concurrent-agent-overspend). + +## Failure 5: The Scope Leak — $8,500/Month Unnoticed + +**The scenario:** + +This one is different from the others. It's not a sudden spike. It's a slow bleed. + +A platform team sets up cost tracking for their AI agents. They create a monthly budget at the organization level: $10,000/month for the engineering org. Each of the five workspaces (frontend, backend, data, infrastructure, ML) uses agents for various tasks. + +The problem: the budget is tracked at the org level, but the workspaces have very different usage patterns. 
+ +| Workspace | Expected monthly spend | Actual monthly spend | +|---|---|---| +| Frontend | $800 | $900 | +| Backend | $1,200 | $1,100 | +| Data | $2,000 | $2,500 | +| Infrastructure | $500 | $400 | +| ML | $3,000 | $8,600 | + +The ML team is running a research agent that explores architecture variations. Each exploration is expensive — long context windows, many iterations, frontier models. In isolation, each run seems reasonable. But the volume is high and growing. + +The org-level budget of $10,000 was set based on initial estimates. For the first two months, total spend was $7,000-$8,000, comfortably under the cap. In month three, the ML team's research agent usage grew as they expanded their experiments. Total org spend hit $13,500. + +But here's the thing: nobody noticed for another two months. The org-level budget didn't have hard enforcement — it was a monitoring threshold. The alert fired, someone checked the dashboard, saw total spend was up, but couldn't quickly attribute it to a single workspace. The growth looked gradual on the org-level chart. It took a quarterly cost review to identify the ML workspace as the source. + +Five months of $8,500/month overspend from the ML workspace (relative to the $3,000 expectation): **$27,500 in excess spend over the quarter**, of which roughly $8,500/month was the ongoing unnoticed overage. + +**The math of scope misconfiguration:** + +| Budget scope | What it catches | What it misses | +|---|---|---| +| Per-organization | Nothing under the org cap | Any single team consuming disproportionate share | +| Per-workspace | Workspace-level overspend | Individual runaway runs within a workspace | +| Per-workflow | Workflow-level anomalies | Cross-workflow accumulation | +| Per-run | Individual runaway runs | Gradual accumulation from many normal runs | + +The right answer is hierarchical scoping: org > workspace > workflow > run. Each level has its own budget. A single run can't blow through the workspace budget. 
A single workspace can't consume the org budget. Each scope catches a different category of failure. + +**How budget enforcement prevents this:** + +Per-workspace budgets in Cycles would have capped the ML team at $3,000/month. When their research agent usage hit that limit, the agents would be denied — not the entire org, just the ML workspace. The other four workspaces would continue operating normally. + +The ML team would immediately know they've hit their budget. They could request an increase (with justification), optimize their agent's efficiency, or prioritize which experiments run within the cap. The decision is explicit and intentional instead of invisible and accidental. + +With hierarchical enforcement: +- Org budget: $10,000/month (hard cap) +- ML workspace: $3,000/month (hard cap) +- ML research workflow: $50/run (hard cap) +- If any level is exhausted, the specific scope is blocked while everything else continues + +For more on this failure pattern, see [Scope Misconfiguration and Budget Leaks](/incidents/scope-misconfiguration-and-budget-leaks). + +## The Common Pattern + +Five different failures. Five different root causes — tool loops, retry cascades, unsupervised batch processing, concurrency races, scope misconfiguration. But they all share one architectural gap: **no pre-execution budget check**. + +In every case, the agent was allowed to spend money without asking permission. The system learned about the spend after the fact — through dashboards, alerts, or invoices. By then, the money was gone. 
+ +| Failure | Cost | Prevention mechanism | Cost with enforcement | +|---|---|---|---| +| Infinite Tool Loop | $4,200 | Per-run budget ($15) | $15 | +| Retry Storm | $1,800 | Per-conversation budget ($2) | $76 | +| Friday Deploy | $12,400 | Per-task + batch budget | $2,500 | +| Concurrent Burst | $3,200 | Atomic reservations ($500 cap) | $502 | +| Scope Leak | $8,500/mo | Hierarchical workspace budgets | $3,000/mo | + +The total across these five scenarios: **$30,100 in preventable spend** (counting three months of the scope leak). With enforcement, the total would have been roughly $6,100 — an 80% reduction, with better user experience and faster failure detection. + +The pattern is simple. Budget enforcement is a pre-execution check. It asks one question before every LLM call: "Is there budget remaining for this?" If yes, proceed. If no, stop. Every failure in this post would have been caught by that single question. + +## Next Steps + +If these failure modes look familiar — or if you'd rather prevent them than experience them: + +- The [End-to-End Tutorial](/quickstart/end-to-end-tutorial) gets you from zero to working budget enforcement in under 30 minutes +- [Common Budget Patterns](/how-to/common-budget-patterns) covers the budget structures that prevent each of these failure modes +- [How to Choose a First Rollout](/quickstart/how-to-choose-a-first-cycles-rollout-tenant-budgets-run-budgets-or-model-call-guardrails) helps you decide where to start: tenant budgets, run budgets, or model call guardrails + +The cheapest incident is the one that never happens. The second cheapest is the one that's capped at $15 instead of $4,200. diff --git a/blog/how-much-do-ai-agents-cost.md b/blog/how-much-do-ai-agents-cost.md new file mode 100644 index 0000000..d516131 --- /dev/null +++ b/blog/how-much-do-ai-agents-cost.md @@ -0,0 +1,209 @@ +--- +title: "How Much Do AI Agents Actually Cost? 
A Breakdown by Provider and Use Case" +date: 2026-03-19 +author: Cycles Team +tags: [costs, agents, guide] +description: "A detailed breakdown of AI agent costs across OpenAI, Anthropic, Google, and AWS Bedrock — with real-world scenarios showing what customer support bots, coding agents, and data pipelines actually cost to run." +blog: true +sidebar: false +--- + +# How Much Do AI Agents Actually Cost? A Breakdown by Provider and Use Case + +A team we talked to recently launched their first production agent — a customer support bot running on GPT-4o. They estimated $800/month based on their prototype traffic. The first invoice came in at $4,200. The model pricing was exactly what they expected. The number of calls was not. Their agent averaged 11 LLM calls per conversation, not the 3 they'd assumed. Context windows grew with each turn. Retries on tool failures doubled the call count on bad days. The per-token price was never the problem. The per-agent price was. + + + +This post is a reference guide. We break down current per-token pricing across the major providers, then show what those prices actually mean when you multiply by the call patterns of real agent workloads. If you're planning a budget for an agent deployment — or trying to understand why your current one costs more than expected — this is the data you need. + +## Per-token pricing by provider + +All prices below are per 1 million tokens. Every provider charges separately for input tokens (what you send) and output tokens (what the model generates). Agents are output-heavy relative to simple completions, because they generate tool calls, reasoning chains, and structured responses. 
+ +### OpenAI + +| Model | Input (per 1M tokens) | Output (per 1M tokens) | Notes | +|---|---|---|---| +| gpt-4o | $2.50 | $10.00 | Flagship multimodal model | +| gpt-4o-mini | $0.15 | $0.60 | Cost-optimized for high-volume | +| gpt-4.1 | $2.00 | $8.00 | Latest generation | +| gpt-4.1-mini | $0.40 | $1.60 | Balanced cost/capability | +| o3 | $2.00 | $8.00 | Reasoning model | +| o4-mini | $1.10 | $4.40 | Compact reasoning model | + +### Anthropic + +| Model | Input (per 1M tokens) | Output (per 1M tokens) | Notes | +|---|---|---|---| +| Claude Opus 4 | $15.00 | $75.00 | Highest capability | +| Claude Sonnet 4 | $3.00 | $15.00 | Strong general-purpose | +| Claude Haiku 3.5 | $0.80 | $4.00 | Fast and cost-efficient | + +### Google + +| Model | Input (per 1M tokens) | Output (per 1M tokens) | Notes | +|---|---|---|---| +| Gemini 2.5 Pro | $1.25 | $10.00 | Advanced reasoning | +| Gemini 2.5 Flash | $0.15 | $0.60 | Optimized for throughput | +| Gemini 2.0 Flash | $0.10 | $0.40 | Lowest cost option | + +A quick observation: the spread between cheapest and most expensive is enormous. Gemini 2.0 Flash output costs $0.40 per million tokens. Claude Opus 4 output costs $75.00 per million tokens. That's a 187x difference. Model selection is the single biggest lever you have on agent costs — but only if your agent architecture actually lets you swap models without breaking functionality. + +## Why agents cost more than you think + +A chatbot makes one call per user message. An agent makes many. + +The disconnect between "per-token pricing looks cheap" and "my agent bill is huge" comes down to four multipliers that compound against each other. + +### Calls per task + +A simple Q&A interaction is one LLM call. A coding agent that reads a file, plans a change, writes code, runs tests, reads the output, and iterates is 15-40 calls for a single task. A deep research agent that searches, reads, synthesizes, and cross-references can hit 80-200 calls. 
The per-token price is irrelevant if you don't know your call count. + +### Retries + +Each retry is a full LLM call, not a cheap repeat. Layered retry logic (SDK, framework, application) can multiply a single failed call into 27 actual calls. We cover the mechanics in detail in [The True Cost of Uncontrolled AI Agents](/blog/true-cost-of-uncontrolled-agents). + +### Context growth + +Every turn of an agent conversation appends to the context window. Turn 1 might send 2,000 tokens. Turn 8 sends 16,000 tokens because it includes the entire conversation history. This is not linear cost growth — it's quadratic-ish, because each call sends everything that came before it plus the new content. + +For an 8-turn conversation where each turn adds 2,000 tokens of new content: +- Turn 1 input: 2,000 tokens +- Turn 4 input: 8,000 tokens +- Turn 8 input: 16,000 tokens +- **Total input tokens across all 8 turns: 72,000** (not 16,000) + +### Fan-out + +Multi-agent architectures multiply everything. A coordinator dispatching to 5 sub-agents turns a single request into 30-50 calls — and each sub-agent has its own retry logic and growing context. See [the cost amplification math](/blog/true-cost-of-uncontrolled-agents#the-math-how-agents-amplify-api-costs) for the full breakdown. + +## Real-world cost scenarios + +Here's what agents actually cost in four common deployments. All estimates use a blended rate of 3,000 input tokens and 1,500 output tokens per call, which is conservative for production agent workloads. + +### Scenario 1: Customer support bot + +A support bot handling customer questions — looking up orders, checking policies, generating responses. 
+ +| Parameter | Value | +|---|---| +| Conversations per day | 100 | +| Turns per conversation | 8 | +| LLM calls per turn | 1.5 (some turns need tool lookups) | +| Total calls per day | 1,200 | +| Avg input tokens per call | 4,000 (grows with conversation) | +| Avg output tokens per call | 800 | + +| Model | Cost per call | Daily cost | Monthly cost | +|---|---|---|---| +| gpt-4o | $0.018 | $21.60 | $648 | +| gpt-4o-mini | $0.001 | $1.20 | $36 | +| Claude Sonnet 4 | $0.035 | $42.00 | $1,260 | +| Claude Haiku 3.5 | $0.009 | $10.80 | $324 | +| Gemini 2.5 Flash | $0.002 | $2.40 | $72 | + +The spread is dramatic. The same support bot costs $36/month on gpt-4o-mini or $1,260/month on Claude Sonnet 4. The capability difference matters — but so does a 35x cost difference. + +### Scenario 2: Coding agent + +An agent that reads codebases, generates changes, runs tests, and iterates on failures. Longer context windows because code files are large. + +| Parameter | Value | +|---|---| +| Tasks per day | 50 | +| LLM calls per task | 25 (avg of 15-40 range) | +| Total calls per day | 1,250 | +| Avg input tokens per call | 6,000 (code context is large) | +| Avg output tokens per call | 2,000 | + +| Model | Cost per call | Daily cost | Monthly cost | +|---|---|---|---| +| gpt-4o | $0.035 | $43.75 | $1,313 | +| gpt-4.1 | $0.028 | $35.00 | $1,050 | +| Claude Sonnet 4 | $0.048 | $60.00 | $1,800 | +| Claude Opus 4 | $0.240 | $300.00 | $9,000 | +| o3 | $0.028 | $35.00 | $1,050 | + +Coding agents on Claude Opus 4 cost $9,000/month at this volume. That's not a bug in the pricing — it's a reflection of running a premium model at agent-scale call volumes. Most teams use Opus for the hardest subtasks and a cheaper model for routine steps. + +### Scenario 3: Data pipeline agent + +An agent that processes documents — extracting data, classifying content, generating summaries. 
+ +| Parameter | Value | +|---|---| +| Documents per day | 1,000 | +| LLM calls per document | 3 (extract, classify, summarize) | +| Total calls per day | 3,000 | +| Avg input tokens per call | 3,000 | +| Avg output tokens per call | 500 | + +| Model | Cost per call | Daily cost | Monthly cost | +|---|---|---|---| +| gpt-4o-mini | $0.001 | $2.10 | $63 | +| gpt-4.1-mini | $0.002 | $6.00 | $180 | +| Gemini 2.0 Flash | $0.001 | $1.50 | $45 | +| Gemini 2.5 Flash | $0.001 | $1.95 | $59 | +| Claude Haiku 3.5 | $0.004 | $12.00 | $360 | + +High-volume, low-complexity pipelines are where the mini and flash models shine. Gemini 2.0 Flash processes 1,000 documents per day for $45/month. The same pipeline on a frontier model would cost 20-100x more with marginal quality improvement for structured extraction tasks. + +### Scenario 4: Multi-agent workflow + +A coordinator agent dispatches work to specialized sub-agents — a planner, a researcher, a writer, a reviewer, and a formatter. Each sub-agent makes its own LLM calls. + +| Parameter | Value | +|---|---| +| Workflows per day | 40 | +| Agents per workflow | 5 | +| Calls per agent per workflow | 8 | +| Total calls per day | 1,600 | +| Avg input tokens per call | 5,000 | +| Avg output tokens per call | 1,500 | + +| Model | Cost per call | Daily cost | Monthly cost | +|---|---|---|---| +| gpt-4o | $0.028 | $44.00 | $1,320 | +| gpt-4.1 | $0.022 | $35.20 | $1,056 | +| Claude Sonnet 4 | $0.038 | $60.00 | $1,800 | +| Mixed (Sonnet coordinator + Haiku workers) | $0.015 avg | $24.00 | $720 | + +The "mixed" row is important. Most production multi-agent systems don't run every agent on the same model. The coordinator and reviewer might use Sonnet 4, while the workers use Haiku 3.5. This cuts costs by 40-60% compared to running everything on the same frontier model. + +## The hidden cost multipliers + +The scenarios above assume clean execution — no failures, no retries, no runaway loops. Production isn't clean. 
Here are the multipliers that turn estimates into surprises. + +### Retry overhead + +A 5% tool failure rate with 3 retries per failure adds 15% to your total call count. That's the optimistic case. If retry logic exists at multiple layers (SDK, framework, application), failures cascade multiplicatively. A 5% failure rate with three-layer retry logic can produce a 45% increase in actual calls. + +### Growing context windows + +The estimates above use average token counts. But agent conversations grow over time. A coding agent that starts with a 4,000-token context on step 1 might be sending 30,000 tokens by step 20 — because every previous step's output is in the context. The last few steps of a long agent run can cost 5-8x more than the first few steps. + +### Tool call overhead + +Each tool call adds tokens in both directions — the tool call schema in the output and the tool result in the next input. A single function call might add 200-500 tokens of overhead per round-trip. An agent that makes 3 tool calls per step adds 600-1,500 tokens of pure overhead per step, compounding across the conversation. + +### Concurrency spikes + +Ten users triggering multi-agent workflows simultaneously means 160 concurrent LLM calls in Scenario 4. If your rate limits can't handle the burst, you get 429 errors, which trigger retries, which create more load. Concurrency doesn't just multiply cost linearly — it creates failure modes that multiply cost super-linearly. + +## What budget enforcement changes + +Knowing your costs is the first step. Controlling them is the next. + +Agent costs are a function of call patterns, not just token prices. A 10% change in model pricing matters far less than a runaway loop that makes 500 calls instead of 50. 
We wrote about [why monitoring alone isn't sufficient](/blog/true-cost-of-uncontrolled-agents#the-observability-gap) and how [pre-execution budget authority](/blog/true-cost-of-uncontrolled-agents#budget-authority-as-infrastructure) closes the gap. + +[Cycles](/) provides this layer. Every LLM call checks against a budget before executing. When the budget is exhausted, the call is denied and the agent degrades gracefully. + +## Next steps + +If you're estimating costs for a new agent deployment or trying to understand an existing one: + +- The [Cost Estimation Cheat Sheet](/how-to/cost-estimation-cheat-sheet) provides formulas and lookup tables for quick sizing +- [Common Budget Patterns](/how-to/common-budget-patterns) covers the most effective ways to structure budgets across the scenarios described above +- The [End-to-End Tutorial](/quickstart/end-to-end-tutorial) walks through setting up Cycles with a working agent in under 30 minutes + +The cheapest agent incident is the one that never happens. Start by knowing your numbers. Then put a system in place to enforce them. diff --git a/blog/true-cost-of-uncontrolled-agents.md b/blog/true-cost-of-uncontrolled-agents.md index 72e51d1..c7a83d5 100644 --- a/blog/true-cost-of-uncontrolled-agents.md +++ b/blog/true-cost-of-uncontrolled-agents.md @@ -30,6 +30,8 @@ Consider a typical agentic workflow: a coding assistant that reads a file, propo | Deep research agent | 80–200 | 8,000 | $19.20–$48.00 | | Runaway agent (tool loop) | 500+ | 10,000 | $150+ | +For detailed per-provider pricing tables and real-world scenario calculators (support bots, coding agents, data pipelines), see [How Much Do AI Agents Actually Cost?](/blog/how-much-do-ai-agents-cost). + Now multiply by concurrency. Ten users triggering deep research agents simultaneously? That's potentially $500 in a few minutes. A retry storm on a flaky tool? Thousands of calls in seconds. 
## The four categories of cost diff --git a/concepts/cycles-vs-custom-token-counters.md b/concepts/cycles-vs-custom-token-counters.md new file mode 100644 index 0000000..41f10f8 --- /dev/null +++ b/concepts/cycles-vs-custom-token-counters.md @@ -0,0 +1,237 @@ +--- +title: "Cycles vs Custom Token Counters: Build vs Buy for Agent Budget Control" +description: "Many teams start with in-app token counters. Here is why they break under concurrency, multi-service deployment, and production load — and when to adopt a dedicated budget authority." +--- + +# Cycles vs Custom Token Counters: Build vs Buy for Agent Budget Control + +Every team that runs AI agents in production eventually builds a token counter. + +It starts the same way every time. + +A developer adds a variable. After each LLM call, increment the counter by the number of tokens used. Before the next call, check if the counter has exceeded the limit. + +```python +if total_tokens < max_tokens: + response = call_llm(prompt) + total_tokens += response.usage.total_tokens +else: + raise BudgetExceeded() +``` + +This works. For a while. + +It works when you have one service, one process, one agent, and one developer who understands the counter. It stops working when any of those assumptions change. + +This article explains where custom token counters break, why they break, and when to replace them with a dedicated budget authority. + +## The natural starting point + +Building your own counter is the rational first move. + +The requirements seem simple: + +- Track how many tokens each run uses +- Stop the run when it exceeds a limit +- Maybe track per-tenant usage for billing + +A database column, a Redis key, or even an in-memory variable can handle this. The implementation takes an afternoon. It ships quickly. It solves the immediate problem. + +Teams that build counters are not doing anything wrong. They are responding to a real need with the simplest possible solution. 
+ +The problems emerge later, when the system grows. + +## Where custom counters break + +### Concurrency: read-then-increment is a race condition + +The basic counter pattern is: read the current value, check if it is under the limit, proceed, then increment. + +That is a textbook time-of-check-to-time-of-use (TOCTOU) race condition. + +When two agent threads run concurrently: + +1. Thread A reads the counter: 900 tokens used out of 1,000 limit. +2. Thread B reads the counter: 900 tokens used out of 1,000 limit. +3. Both threads see headroom. Both proceed. +4. Thread A's call uses 200 tokens. Thread B's call uses 200 tokens. +5. Actual total: 1,300 tokens. Budget exceeded by 30%. + +This is not a theoretical concern. It is the most common bug in custom counter implementations. + +Solving it correctly requires atomic compare-and-swap operations, database-level locking, or serialized access. Most ad hoc counters do not implement any of these. Even when they do, the implementation is often subtly wrong — it works under light load and breaks under production concurrency. + +### Multi-process and multi-service: the counter is local + +A counter stored in application memory only exists in one process. + +When the system scales to multiple instances, each instance has its own counter. The budget is effectively multiplied by the number of instances. Three replicas of a service, each with a 1,000-token limit, actually allow 3,000 tokens. + +Moving the counter to a shared store (Redis, PostgreSQL) solves the locality problem but introduces the concurrency problem. Now every read-check-increment must be atomic across a network boundary. Latency, retries, and connection failures add complexity. + +Moving to a shared store also means every service that makes LLM calls needs to know about the counter, use the same key scheme, and handle failures consistently. That coordination cost grows with each new service. 
+ +### No reservation model: cannot hold budget for in-flight work + +Custom counters typically track what has been used. They do not track what is currently in flight. + +Consider an agent that has used 800 of its 1,000 token budget. It starts a new LLM call that is estimated to use 150 tokens. While that call is in flight, another thread checks the counter and sees 800. It also starts a call. Both calls complete. The total is 1,100. + +The counter was accurate at the time of the check. It just did not account for work that was already happening. + +A reservation model solves this. Before the call, the system reserves 150 tokens. The counter immediately reflects 950 (800 used + 150 reserved). The next thread sees 950 and knows the budget is nearly exhausted. + +After the call completes, the reservation is committed at the actual cost. If the call used only 120 tokens, the remaining 30 are released. + +Building a correct reservation model on top of a simple counter is a significant engineering effort. It requires atomic reservation, commit, release, and TTL-based expiry for reservations that never complete. Most teams do not build this. + +### No hierarchical scopes + +A counter tracks one number against one limit. + +Production systems need limits at multiple levels: + +- **Tenant level:** This customer may spend $500 per month. +- **Workspace level:** This workspace may spend $100 per day. +- **Workflow level:** This workflow type may spend $10 per execution. +- **Run level:** This specific run may spend $2. +- **Action level:** This individual LLM call may spend $0.50. + +Enforcing all of these simultaneously means a single action must check budget at five levels before proceeding. Each level must be decremented atomically. If any level is insufficient, the action must be denied. + +Building this with ad hoc counters means maintaining five separate counters per action, with correct rollup logic, atomic multi-key operations, and consistent error handling. 
The complexity is substantial. + +Cycles supports hierarchical scopes natively. A single reservation checks all applicable scopes in one atomic operation. + +### No overage policies + +A custom counter has two states: under budget and over budget. The response is binary: proceed or fail. + +Production systems need more nuance. + +When a tenant is approaching their budget limit, the right response might not be "stop." It might be: + +- Switch from GPT-4 to GPT-3.5 (cheaper, faster, good enough for this task) +- Reduce the context window from 128K tokens to 16K tokens +- Skip the optional document enrichment step +- Return a cached response instead of a live inference +- Allow the action but flag it for review + +This is graceful degradation. It keeps the system running at reduced capability instead of failing hard. + +Cycles supports this through its three-way decision model. When budget is low, the system returns ALLOW_WITH_CAPS instead of DENY. The caller receives structured guidance on how to proceed with reduced resources. + +Implementing this on top of a custom counter requires the counter to return not just "yes" or "no" but also "how much is left" and "what constraints apply." That turns a simple counter into a policy engine. Most teams do not make that investment. + +### Maintenance burden: every new service needs the same logic + +When one service has a token counter, it works fine. + +When five services have token counters, each implemented slightly differently, the system has five potential sources of budget accounting bugs. + +Service A uses Redis with INCR. Service B uses PostgreSQL with a row lock. Service C uses an in-memory counter because "it's just a prototype." Service D was supposed to add a counter but the team ran out of time. + +The result is inconsistent enforcement, duplicated logic, and fragile coordination. 
Every new service that makes LLM calls must re-implement the counter pattern, or integrate with whichever shared counter exists, or — most commonly — skip it and hope for the best. + +Cycles centralizes budget authority in one service. Every client integrates through the same protocol. The budget logic lives in one place. New services call the same API. There is one source of truth for budget state. + +## Comparison + +| | In-App Counter | Cycles | +|---|---|---| +| **Concurrency safety** | Race conditions under parallel access | Atomic reservations — no TOCTOU bugs | +| **Multi-service** | Counter is local to one process or requires custom shared store | Centralized budget authority accessible from any service | +| **Reservation model** | None — tracks past usage, not in-flight work | Reserve before execution, commit after, release on cancel | +| **Hierarchical scopes** | Flat — one counter, one limit | Nested — tenant → workspace → workflow → run → action | +| **Overage policies** | Binary — allow or deny | Three-way — ALLOW, ALLOW_WITH_CAPS, DENY | +| **Maintenance** | Duplicated across services, each with its own bugs | Single integration point, one protocol, one source of truth | +| **Retry handling** | Fragile — retries may double-count or skip counting | Idempotent — retries tied to the same reservation lifecycle | +| **TTL and expiry** | Manual cleanup if at all | Built-in reservation TTL with automatic expiry and release | +| **Audit trail** | Application logs, if instrumented | Structured reservation and commit records | + +## The inflection point: when to move from counters to Cycles + +Custom counters are not always wrong. They are a valid solution at a certain scale. + +The inflection point comes when one or more of these conditions appear. + +### Multiple services making LLM calls + +Once budget enforcement must span more than one service, a local counter is no longer sufficient. 
The coordination cost of keeping multiple counters consistent exceeds the cost of adopting a centralized authority. + +### Multi-tenant deployment + +When different tenants share the same infrastructure and need independent budget limits, the counter must become tenant-aware. Multiplied by hierarchical scopes (tenant, workspace, run), the counter logic becomes a budget system whether you intended to build one or not. + +### Production concurrency + +When agents run in parallel — multiple threads, multiple instances, multiple workflows — the TOCTOU race condition becomes a real source of overspend. Solving it correctly with custom code requires careful engineering that is hard to get right and easy to break during refactoring. + +### Need for graceful degradation + +When the business requires more than hard cutoffs — when "switch to a cheaper model" is the right response instead of "error 403" — a binary counter is no longer expressive enough. + +### Compliance or audit requirements + +When the organization needs to demonstrate that every LLM call was authorized against a budget, with a clear trail of reservations and commits, ad hoc counters do not provide the necessary structure. + +If none of these apply, a custom counter may be all you need. Not every system requires a dedicated budget authority. A prototype, a single-service application with low concurrency, or an internal tool with one user can work fine with a simple counter. + +But if two or more of these conditions are present, the custom counter is likely accumulating correctness debt faster than the team can repay it. + +## Migration path + +Moving from custom counters to Cycles does not require a big-bang migration. + +### Step 1: Deploy in shadow mode + +Start Cycles in shadow mode alongside your existing counters. Cycles evaluates budget decisions but does not enforce them. Both systems run in parallel. + +Compare the decisions. Does Cycles agree with your counter? Where do they diverge? 
Divergences usually reveal bugs in the custom counter — race conditions, missing scope checks, or inconsistent state. + +See [Shadow Mode Rollout](/how-to/shadow-mode-in-cycles-how-to-roll-out-budget-enforcement-without-breaking-production) for a detailed guide. + +### Step 2: Validate scope configuration + +Configure Cycles with the same budget limits your counters enforce. Map your counter keys to Cycles scopes. Verify that the hierarchical scopes (tenant, workspace, workflow, run) match your application's budget structure. + +### Step 3: Enable enforcement on one service + +Pick the lowest-risk service — the one with the simplest counter logic and the least concurrency. Switch it from the custom counter to Cycles. Monitor for a week. + +### Step 4: Roll out to remaining services + +Move each service from its custom counter to Cycles. With each migration, the custom counter code can be removed. The budget logic converges on a single integration point. + +### Step 5: Remove the custom counters + +Once all services use Cycles, the custom counter code can be deleted. No more duplicated logic, no more inconsistent enforcement, no more race conditions in hand-rolled concurrency handling. + +The result is a system where budget authority is centralized, concurrency-safe, and consistent across every service that makes LLM calls. + +## The build vs buy calculation + +Building a custom counter is cheap at first. The initial implementation takes hours. + +Maintaining it under production conditions costs more than most teams expect: + +- Debugging race conditions that only manifest under load +- Coordinating counter logic across services during refactors +- Adding hierarchical scopes after the fact +- Building reservation semantics on top of a simple increment +- Handling edge cases around retries, crashes, and partial failures +- Explaining to the team why the budget numbers do not add up + +Cycles is designed to handle these concerns from the start. 
It is not a better counter. It is a different primitive — a budget authority with reservation semantics, hierarchical scopes, and concurrency safety built in. + +The question is not whether you can build it yourself. You can. + +The question is whether budget accounting is where your team should be spending its engineering time. + +## Next steps + +- Read the [Cycles Protocol](https://github.com/runcycles/cycles-protocol) +- Run the [Cycles Server](https://github.com/runcycles/cycles-server) +- Integrate with Python using the [Python Client](/quickstart/getting-started-with-the-python-client) +- Integrate with TypeScript using the [TypeScript Client](/quickstart/getting-started-with-the-typescript-client) +- Try the [End-to-End Tutorial](/quickstart/end-to-end-tutorial) — zero to a working budget-guarded LLM call in ten minutes diff --git a/concepts/cycles-vs-guardrails-ai.md b/concepts/cycles-vs-guardrails-ai.md new file mode 100644 index 0000000..297732e --- /dev/null +++ b/concepts/cycles-vs-guardrails-ai.md @@ -0,0 +1,217 @@ +--- +title: "Cycles vs Guardrails AI: Budget Authority vs Content Safety" +description: "Guardrails AI validates LLM outputs for content safety. Cycles enforces budget limits before execution. They solve different problems and work well together." +--- + +# Cycles vs Guardrails AI: Budget Authority vs Content Safety + +Guardrails AI and Cycles both sit in the path of LLM execution. + +They both add control. They both can prevent bad outcomes. + +But they control different things entirely. + +Guardrails AI validates **what the model says**. + +Cycles controls **whether the model gets called at all**. + +One is about content safety. The other is about budget authority. They operate at different points in the execution lifecycle, solve different problems, and complement each other cleanly. + +## What Guardrails AI does + +Guardrails AI is a framework for validating LLM inputs and outputs. 
It wraps model calls with validators that check whether the response meets defined safety and quality criteria. + +Its core capabilities include: + +### Output validation + +Guardrails checks whether an LLM response meets structural and content requirements. Does the output match a schema? Does it contain required fields? Is the JSON well-formed? + +### Content safety rails + +Guardrails can detect and filter harmful content — toxicity, bias, personally identifiable information, profanity, or any content that violates a policy. It intercepts unsafe outputs before they reach the user. + +### Schema enforcement + +When an application expects structured output from an LLM, Guardrails ensures the response conforms to a defined schema. If the output is malformed, Guardrails can retry the call or return a corrected version. + +### Prompt injection detection + +Guardrails can identify attempts to manipulate the model through adversarial inputs. It adds a layer of defense against prompt injection attacks that try to override system instructions. + +### Retry and re-ask logic + +When validation fails, Guardrails can automatically retry the LLM call, optionally re-asking with a corrected prompt. This creates a feedback loop that improves output quality without manual intervention. + +These are valuable capabilities. Content safety and output quality are real problems that need real solutions. + +But none of these capabilities address the question: should this model call happen at all, given what the system has already spent? + +## What Cycles does + +Cycles is a budget authority for autonomous execution. It enforces cost limits before work begins, using a reserve-then-commit lifecycle. + +Its core capabilities include: + +### Pre-execution budget enforcement + +Before an agent calls a model, Cycles checks whether sufficient budget remains. If the budget is exhausted, the call does not happen. The decision is made before any cost is incurred. 
+ +### Reserve-then-commit lifecycle + +Cycles does not just track spend after the fact. It reserves estimated cost before execution, then commits actual cost afterward. Unused budget is released automatically. This prevents concurrent requests from racing past a budget limit. + +### Concurrency-safe budget tracking + +When multiple agent threads or workflows run in parallel, Cycles uses atomic reservations to prevent overspend. Two threads cannot both claim the last $5 of budget — the reservation is atomic. + +### Hierarchical scope enforcement + +Budgets can be enforced at multiple levels simultaneously: tenant, workspace, workflow, run, and action. A single reservation can check all applicable scopes in one operation. + +### Three-way decisions + +Instead of a binary allow/deny, Cycles supports three responses: + +- **ALLOW** — budget is sufficient, proceed normally +- **ALLOW_WITH_CAPS** — budget is low, proceed with constraints (use a cheaper model, skip optional steps) +- **DENY** — budget is exhausted, do not proceed + +This enables graceful degradation instead of hard failures. + +## The key difference + +Guardrails AI and Cycles ask fundamentally different questions. + +**Guardrails asks:** Is this LLM output safe, correct, and well-formed? + +**Cycles asks:** Is this LLM call authorized to execute given the remaining budget? + +Guardrails operates on content. It examines what the model produced and decides whether that content should be passed through, corrected, or blocked. + +Cycles operates on economics. It examines the budget state and decides whether the model should be invoked at all. + +A model call can pass Guardrails validation (the output is safe and well-formed) while failing Cycles enforcement (the budget is exhausted). And vice versa — a call can be authorized by Cycles (budget is available) while being flagged by Guardrails (the output contains PII). + +These are independent concerns. Neither subsumes the other. 
+ +## Comparison + +| | Guardrails AI | Cycles | +|---|---|---| +| **Primary concern** | Content safety and output quality | Budget governance and cost control | +| **When it acts** | After LLM response (output validation) or before call (input validation) | Before LLM call (pre-execution budget check) | +| **What it prevents** | Toxic content, schema violations, prompt injection, PII leakage | Budget overruns, unbounded spend, cost race conditions | +| **Concurrency model** | Per-request validation (stateless) | Atomic reservations across concurrent requests (stateful) | +| **Budget awareness** | None — does not track cost or spend | Core function — reserves, commits, and tracks budget across scopes | +| **Protocol** | Python framework with validators and guards | Open protocol with reserve/commit/release lifecycle | +| **Retry behavior** | Re-asks the model with corrected prompts | Idempotent reservations — retries do not double-spend | +| **Scope** | Per-call input/output validation | Per-tenant, per-workflow, per-run hierarchical budgets | +| **Degradation** | Can correct or filter outputs | Can downgrade model choice, reduce scope, or deny execution | + +## Where Guardrails AI falls short for budget control + +Guardrails AI is not designed for cost governance. That is not a criticism — it is a scope observation. + +### No cumulative cost tracking + +Guardrails validates each call independently. It does not maintain a running total of how much a workflow, run, or tenant has spent. It cannot answer: "Should we stop calling the model because this run has already consumed $8 of its $10 budget?" + +### No pre-execution cost check + +Guardrails primarily acts on the output side. It checks the response after the model has been called. By then, the cost has already been incurred. Even its input validators do not perform budget checks. + +### No reservation semantics + +Guardrails has no concept of reserving budget before execution and committing actual cost afterward. 
It cannot prevent two concurrent calls from exceeding a shared budget because it does not track budgets at all. + +### No hierarchical budget scopes + +Guardrails does not enforce limits at the tenant, workspace, or workflow level. It operates on individual model calls without cross-call or cross-scope awareness. + +### Retries increase cost + +When Guardrails re-asks the model after a validation failure, that retry costs money. There is no budget check before the retry. If the model fails validation five times, the system pays for five calls — regardless of whether the budget can absorb them. + +## Where Cycles falls short for content safety + +Cycles is not designed for content validation. That is equally intentional. + +### No output inspection + +Cycles does not examine what the model said. It does not know whether the response contains PII, toxic language, or malformed JSON. It authorized the call to happen. What the model produces is outside its scope. + +### No schema enforcement + +Cycles does not validate whether LLM output matches a required structure. It governs execution economics, not output structure. + +### No prompt injection detection + +Cycles does not inspect prompts or responses for adversarial manipulation. That is a content-layer concern, not a budget-layer concern. + +### No content filtering + +Cycles cannot detect or remove harmful content from model responses. It does not operate on content at all. + +## Using both together + +Guardrails AI and Cycles sit at different points in the execution path. They complement each other naturally. + +The flow looks like this: + +``` +Agent decides to call an LLM + → Cycles: Is there budget for this call? + → DENY → Do not call the model. Return a fallback or error. + → ALLOW_WITH_CAPS → Call a cheaper model or reduce context. + → ALLOW → Proceed with the intended model. + → LLM call executes + → Guardrails: Is this output safe and well-formed? + → FAIL → Re-ask or return corrected output. 
+ (Each retry also checks Cycles for budget.) + → PASS → Return output to the caller. + → Cycles: Commit actual cost. Release unused reservation. +``` + +This creates two complementary control layers: + +1. **Budget check first (Cycles).** Before spending money, verify that the budget allows it. This prevents wasted cost on calls that should never have happened. + +2. **Content check second (Guardrails).** After getting a response, verify that it meets safety and quality standards. This prevents unsafe or malformed content from reaching users. + +The critical detail is in the retry loop. When Guardrails triggers a re-ask, that retry should also pass through Cycles. Otherwise, repeated validation failures can create unbounded cost — the model keeps getting called, failing validation, and retrying, with no budget check on each retry. + +### Example: a customer support agent + +Consider an AI agent that handles customer inquiries. + +**Without either tool:** The agent calls GPT-4 for every message. A confused customer sends 50 messages in a long conversation. The agent loops through tool calls, retries, and multi-step reasoning. The run costs $30. The output occasionally contains PII from the CRM lookup. Nobody catches either problem until after the fact. + +**With Guardrails only:** The agent's outputs are validated for PII and toxicity. Content safety is handled. But the agent still loops through expensive calls without limit. The $30 run still happens. + +**With Cycles only:** The agent's budget is capped at $5 per run. After $5, the agent degrades to a cheaper model or stops. Cost is controlled. But the outputs are not checked for PII or safety violations. + +**With both:** The agent's budget is capped at $5 per run (Cycles). Each output is validated for PII and safety (Guardrails). Retries triggered by Guardrails are checked against the remaining budget (Cycles). The system is both safe and economical. 
+ +## Different problems, different layers + +It is tempting to look for one tool that handles everything. That is not how production systems work. + +Content safety and budget authority are independent concerns: + +- A safe output can be too expensive. +- A cheap output can be unsafe. +- A well-formed response can come from a run that already exceeded its budget. +- A budget-compliant run can produce toxic content. + +Guardrails AI solves the content problem. Cycles solves the cost problem. Together, they give teams control over both what the model says and how much it costs to say it. + +Neither tool is optional if you care about both. + +## Next steps + +- Read the [Cycles Protocol](https://github.com/runcycles/cycles-protocol) +- Run the [Cycles Server](https://github.com/runcycles/cycles-server) +- Integrate with Python using the [Python Client](/quickstart/getting-started-with-the-python-client) +- Integrate with TypeScript using the [TypeScript Client](/quickstart/getting-started-with-the-typescript-client) +- Try the [End-to-End Tutorial](/quickstart/end-to-end-tutorial) — zero to a working budget-guarded LLM call in ten minutes diff --git a/concepts/cycles-vs-provider-spending-caps.md b/concepts/cycles-vs-provider-spending-caps.md new file mode 100644 index 0000000..131fb49 --- /dev/null +++ b/concepts/cycles-vs-provider-spending-caps.md @@ -0,0 +1,217 @@ +--- +title: "Cycles vs Provider Spending Caps: Why Platform Limits Are Not Enough" +description: "OpenAI, Anthropic, and Google all offer spending limits — but they are monthly, org-wide, and delayed. See why teams need finer-grained budget authority." +--- + +# Cycles vs Provider Spending Caps: Why Platform Limits Are Not Enough + +Every major LLM provider offers some form of spending control. + +OpenAI has usage limits. Anthropic has spending limits. Google Cloud has budget alerts and quotas. AWS Bedrock has service quotas. + +These exist for good reason. 
They prevent surprise bills at the organizational level. They are a safety net. + +But a safety net is not a governance system. + +Provider spending caps operate at the wrong granularity, the wrong timing, and the wrong scope for teams running autonomous AI agents in production. + +## What provider caps offer + +Provider spending caps vary by vendor, but the general pattern is consistent. + +### OpenAI usage limits + +OpenAI allows organizations to set monthly spend limits on their API usage. When the limit is reached, API calls are rejected. Organizations can also set per-project or per-API-key budgets. Usage data is available through a dashboard with some reporting delay. + +### Anthropic spending limits + +Anthropic provides workspace-level spending limits. Organizations can set monthly caps that hard-block API access once reached. Usage is tracked at the workspace level and visible through the console. + +### Google Cloud budget alerts + +Google Cloud offers budget alerts for Vertex AI and other services. These are primarily notification-based — they send alerts at defined thresholds (50%, 90%, 100%) but do not automatically block usage. Actual enforcement requires additional configuration through quota policies. + +### AWS Bedrock service quotas + +AWS provides service quotas that limit tokens per minute and requests per minute for Bedrock models. These are throughput limits, not spend limits. Cost governance requires separate AWS Budgets configuration, which is alert-based with optional automated actions. + +### The common thread + +All of these share a similar shape: + +- Organization-wide or workspace-wide scope +- Monthly or daily granularity +- Delayed usage reporting +- Binary enforcement (all traffic blocked, or nothing) +- Single-provider visibility + +For basic protection against runaway API bills, they work. That is their purpose. + +The problem starts when teams need more than basic protection. 
+ +## Why provider caps are not sufficient + +### Monthly or daily granularity — not per-run, not per-workflow + +Provider caps operate on calendar time. You set a monthly limit or a daily limit. + +But autonomous agents operate in runs. A single agent run might take 30 seconds and make 15 LLM calls. Another run might take 4 hours and make 300 calls. The cost difference between these runs can be orders of magnitude. + +A monthly cap cannot express: "This run may spend at most $5." It can only express: "This organization may spend at most $10,000 this month." + +That means a single runaway run can consume a significant portion of the monthly budget before anyone notices. The cap will eventually trigger, but not before damage is done. + +### Org-wide scope — not per-tenant, not per-user, not per-workspace + +Provider caps apply to the organization or API key. They cannot distinguish between tenants sharing the same infrastructure. + +If you run a multi-tenant platform where each customer gets their own AI agent, a provider cap cannot enforce per-customer budgets. One customer's runaway agent can exhaust the cap for all customers. + +This is the most common gap teams discover. They have 50 tenants sharing one OpenAI API key. The monthly cap is set at $50,000. One tenant's agent loops overnight and consumes $8,000. The provider cap does not know or care which tenant caused it. It only knows the organization-level total. + +### Delayed enforcement + +Provider usage data is not real-time. + +OpenAI usage updates can lag by minutes. Anthropic and Google have similar delays. AWS Bedrock usage data flows through CloudWatch, which adds its own latency. + +That means a cap set at $1,000 might not trigger until actual spend reaches $1,050 or $1,100, depending on the velocity of requests and the reporting delay. + +For autonomous agents making rapid successive calls, this delay can be significant. An agent can make dozens of expensive calls in the minutes between usage updates. 
+ +### No pre-execution check + +Provider caps are reactive. + +The model call happens. The tokens are consumed. The cost is recorded. Then the cap is checked. + +There is no mechanism to ask: "Does this organization have enough budget for this specific call?" before the call executes. + +That means the system always incurs at least one over-budget call before enforcement kicks in. Under high concurrency, it can incur many. + +Cycles inverts this. Budget is reserved before execution. If the budget is insufficient, the call never happens. Zero cost is incurred for denied requests. + +### No graceful degradation + +When a provider cap triggers, all API calls fail. + +There is no middle ground. The system goes from fully operational to completely blocked. Every agent, every workflow, every tenant — all stopped at once. + +This is the equivalent of a circuit breaker with no dimmer switch. + +Production systems need nuance: + +- Switch to a cheaper model when budget is low +- Reduce context window size +- Skip optional enrichment steps +- Serve cached responses instead of live inference +- Degrade gracefully for low-priority workflows while keeping high-priority ones running + +Provider caps cannot express any of this. They have one response: block everything. + +Cycles supports three-way decisions (ALLOW, ALLOW_WITH_CAPS, DENY) that enable graceful degradation at the per-action level. A workflow can continue with reduced capability instead of failing completely. + +### Multi-provider blind spots + +Most teams do not use a single LLM provider. + +A typical production stack might include: + +- OpenAI for GPT-4 and embeddings +- Anthropic for Claude +- Google for Gemini +- A local model for low-latency classification + +Each provider tracks its own usage independently. None of them know about spend on the other providers. + +A team that has budgeted $500 per day across all providers has no single place to enforce that limit. OpenAI knows about OpenAI spend. 
Anthropic knows about Anthropic spend. Neither knows the total. + +Cycles aggregates budget across providers. A single reservation can account for the expected cost of any model call, regardless of which provider serves it. The budget boundary is defined by the application, not by the vendor. + +## Comparison + +| | Provider Cap | Cycles | +|---|---|---| +| **Granularity** | Monthly or daily, per-organization | Per-tenant, per-workspace, per-workflow, per-run, per-action | +| **Scope** | Organization or API key | Hierarchical — tenant → workspace → workflow → run | +| **Enforcement timing** | Post-usage with reporting delay | Pre-execution — budget reserved before the call | +| **Multi-provider** | Single provider only | Aggregates across all providers in one budget | +| **Degradation** | Binary — all traffic blocked or all allowed | Three-way — ALLOW, ALLOW_WITH_CAPS, DENY | +| **Protocol** | Vendor-specific dashboard and API | Open protocol with reserve/commit/release lifecycle | +| **Concurrency handling** | Delayed counter — race conditions under load | Atomic reservations — no overspend under concurrency | +| **Per-tenant enforcement** | Not supported | Built-in hierarchical scopes | +| **Retry awareness** | None — each retry is a new charge | Idempotent reservations — retries do not double-spend | + +## The delay problem in detail + +The reporting delay deserves special attention because it is the subtlest failure mode. + +Consider an agent making calls at a steady rate of one per second. Each call costs approximately $0.10. The provider cap is set at $100. + +At second 1,000, the agent has spent $100. But the provider's usage dashboard reflects spend as of second 940 — a 60-second reporting delay. The cap has not triggered. + +The agent makes 60 more calls before the cap catches up. That is $6 of overspend — a 6% overrun. + +Now increase the call rate. Five calls per second, each costing $0.50. 
At the same 60-second delay, that is 300 calls and $150 of overspend on a $100 cap — a 150% overrun. + +This is not a bug. It is an inherent limitation of post-hoc enforcement with delayed reporting. + +Cycles avoids this entirely. Budget is reserved before execution. The reservation is atomic and immediate. There is no delay between the budget check and the budget decrement. + +## When to use both + +Provider caps and Cycles are not mutually exclusive. They serve as different layers of defense. + +### Keep provider caps as a safety net + +Provider caps are your last line of defense. If everything else fails — if Cycles is misconfigured, if a bug bypasses the budget check, if a new service is deployed without integration — the provider cap catches it. + +Set your provider caps at a level that represents your absolute maximum acceptable spend. This is the "something has gone badly wrong" threshold. + +### Use Cycles for operational control + +Cycles is your operational layer. It enforces the budgets that matter to your business: + +- Per-tenant limits that align with pricing tiers +- Per-workflow limits that prevent individual runs from spiraling +- Per-run limits that bound the cost of any single agent execution +- Degradation policies that keep the system running under budget pressure + +This is the layer that runs day-to-day. It handles the normal case, the edge cases, and the concurrent cases. + +### Defense in depth + +The combination creates defense in depth: + +1. **Cycles** handles per-tenant, per-run, per-workflow budget enforcement with pre-execution checks. This is the primary control layer. +2. **Provider caps** handle organizational safety nets. They catch anything that slips through the primary layer. + +If Cycles is working correctly, provider caps should never trigger. They exist for the case where Cycles is not working correctly. + +That is good engineering. Multiple independent layers of control, each catching different failure modes. 
+ +## Migration path + +Teams that currently rely on provider caps alone can adopt Cycles incrementally. + +**Step 1: Shadow mode.** Deploy Cycles in shadow mode. It evaluates budget decisions but does not enforce them. Log the decisions. Compare what Cycles would have done against what actually happened. + +**Step 2: Validate.** Review the shadow mode data. Are the budget allocations correct? Are the scope hierarchies right? Would enforcement have blocked legitimate work? Adjust the configuration. + +**Step 3: Enforce on new workflows.** Enable enforcement for new or low-risk workflows first. Keep shadow mode on everything else. + +**Step 4: Expand enforcement.** Gradually move more workflows from shadow mode to enforcement as confidence builds. + +**Step 5: Adjust provider caps.** Once Cycles is handling operational budget control, raise your provider caps to be true safety nets — generous enough to never trigger under normal operation, strict enough to catch genuine failures. + +Provider caps become your fire alarm. Cycles becomes your thermostat. + +One prevents catastrophe. The other maintains comfortable operating conditions. 
+ +## Next steps + +- Read the [Cycles Protocol](https://github.com/runcycles/cycles-protocol) +- Run the [Cycles Server](https://github.com/runcycles/cycles-server) +- Integrate with Python using the [Python Client](/quickstart/getting-started-with-the-python-client) +- Integrate with TypeScript using the [TypeScript Client](/quickstart/getting-started-with-the-typescript-client) +- Try the [End-to-End Tutorial](/quickstart/end-to-end-tutorial) — zero to a working budget-guarded LLM call in ten minutes diff --git a/concepts/cycles-vs-rate-limiting.md b/concepts/cycles-vs-rate-limiting.md new file mode 100644 index 0000000..b36e80f --- /dev/null +++ b/concepts/cycles-vs-rate-limiting.md @@ -0,0 +1,205 @@ +--- +title: "Cycles vs Rate Limiting: Why Velocity Controls Fail for AI Agents" +description: "Rate limiters control request velocity but cannot govern total spend. See how Cycles adds cost-aware, pre-execution budget authority where rate limits fall short." +--- + +# Cycles vs Rate Limiting: Why Velocity Controls Fail for AI Agents + +Rate limiting is one of the most widely deployed control patterns in software. + +It works. It has worked for decades. + +But it was designed for a different problem than the one AI agents create. + +Rate limiters answer: **how fast?** + +Cycles answers: **how much?** + +That distinction determines whether your system can burn through $10,000 overnight while staying perfectly within its RPM limit. + +## What rate limiting does well + +Rate limiters are effective at three things. + +### Abuse prevention + +A rate limiter keeps a bad actor from hammering your API. It sets a ceiling on request velocity per caller, per endpoint, or per time window. That is essential for any public-facing service. + +### Traffic shaping + +Rate limiters smooth bursty traffic. They protect downstream services from sudden spikes, keep queue depths manageable, and help maintain latency targets under load. 
+ +### Fairness + +In multi-tenant systems, rate limiters ensure one tenant cannot monopolize shared resources. Every caller gets a fair share of throughput. + +These are real, valuable properties. Nothing in this article suggests removing your rate limiter. + +The question is whether rate limiting alone is sufficient when autonomous agents enter the picture. + +It is not. + +## Where rate limiting fails for AI agents + +AI agents break the assumptions that make rate limiting sufficient. + +### Rate limiters do not track cumulative cost + +A rate limiter knows how many requests passed through in the last minute. It does not know how much those requests cost in total. + +An agent that makes 10 requests per minute stays within a 60 RPM limit. But if each request triggers a long-context GPT-4 call with tool use, the cost per request might be $0.50 or more. That is $300 per hour. $7,200 per day. All within the rate limit. + +The rate limiter sees normal traffic. The bill tells a different story. + +### Rate limiters cannot distinguish cheap calls from expensive calls + +To a rate limiter, every request is identical. A call that uses 100 input tokens and a call that uses 100,000 input tokens count the same: one request. + +This is the fundamental mismatch. AI workloads have extreme cost variance between requests. A simple classification call might cost $0.001. A multi-turn agentic workflow with tool calls might cost $5.00. Both are one request. + +Rate limiting treats them identically. Budget authority cannot afford to. + +### Rate limiters have no per-run or per-workflow awareness + +A rate limiter operates at the connection level. It does not know that five requests belong to the same agent run, or that a workflow has fanned out into twelve parallel sub-tasks. + +It cannot enforce: "this workflow may only spend $2 total." It can only enforce: "this caller may make N requests per time window." 
+ +That means an agent can spawn sub-tasks, retry failed steps, and loop through tool calls — all within the rate limit — while the total cost of a single run spirals. + +### An agent can stay within RPM limits and burn $10K overnight + +This is not a theoretical risk. It is the most common failure mode teams report. + +The agent is well-behaved. It respects rate limits. It does not spike. It does not look like abuse. + +It simply runs continuously, making steady, moderately expensive calls. Each call is allowed. The total is not governed. + +By morning, the bill is $10,000. Nothing in the rate limiter flagged it. + +The problem is not velocity. The problem is unbounded cumulative spend. + +### No graceful degradation + +When a rate limiter triggers, it returns 429 Too Many Requests. The client backs off and retries. + +That is a binary response: allowed or throttled. + +AI agents need a richer vocabulary. Sometimes the right answer is not "stop" but "continue with a cheaper model." Or "reduce the number of tool calls." Or "skip the optional enrichment step." + +Rate limiters cannot express these nuances. They have one lever: velocity. 
+ +## Comparison + +| | Rate Limiter | Cycles | +|---|---|---| +| **Controls** | Request velocity (RPM, RPS) | Total budgeted exposure (cost, tokens, units) | +| **Granularity** | Per-caller, per-endpoint, per-time-window | Per-tenant, per-workspace, per-workflow, per-run | +| **Cost-aware** | No — every request counts equally | Yes — reserves estimated cost, commits actual cost | +| **Pre-execution budget check** | Velocity only — no cumulative awareness | Yes — checks remaining budget across all scopes before execution | +| **Concurrency-safe** | Yes for velocity counting | Yes — atomic reservations prevent race conditions on budget | +| **Degradation support** | No — binary allow/throttle | Yes — three-way decision: ALLOW, ALLOW_WITH_CAPS, DENY | + +## How Cycles works where rate limiting cannot + +Cycles introduces a reserve-then-commit model that is fundamentally different from velocity counting. + +Before an agent action executes: + +1. The system declares how much budget the action is expected to consume. +2. Cycles checks whether that budget is available across all applicable scopes (tenant, workspace, workflow, run). +3. If available, the budget is atomically reserved. No other concurrent request can claim the same budget. +4. The action executes. +5. After execution, the system commits the actual cost. If the actual cost is less than the reservation, the remainder is released automatically. + +This model answers questions that rate limiters cannot: + +- Has this run already consumed too much? Then deny or degrade the next step. +- Is the tenant approaching its daily limit? Then switch to a cheaper model. +- Are concurrent requests about to exceed the workflow budget? The reservation is atomic — only one will succeed. +- Did the action cost less than expected? The unused budget is released for other work. + +## The concurrency problem + +Rate limiters handle concurrency well for velocity. They are designed for it. 
+ +But budget governance under concurrency is a different problem. + +Consider two agent threads running in parallel against the same workflow budget. The budget has $5 remaining. Both threads check the budget, both see $5 available, and both proceed. Total spend: $10 against a $5 budget. + +This is a classic time-of-check-to-time-of-use (TOCTOU) race condition. Rate limiters do not protect against it because they do not track cumulative spend. + +Cycles handles this with atomic reservations. When the first thread reserves $5, that budget is immediately unavailable to the second thread. The second thread's reservation attempt sees the reduced balance and can be denied or degraded. + +No race condition. No overspend. + +## When to use both together + +Rate limiting and Cycles solve different problems. Most production systems should use both. + +**Keep your rate limiter for:** + +- Abuse prevention — stopping bad actors from flooding your API +- Traffic shaping — smoothing bursts to protect downstream services +- Fairness — ensuring no single caller monopolizes throughput +- DDoS mitigation — absorbing malicious traffic spikes + +**Add Cycles for:** + +- Budget governance — bounding total spend per tenant, workflow, or run +- Cost-aware decisions — distinguishing cheap calls from expensive ones +- Graceful degradation — downgrading to cheaper models when budget is low +- Pre-execution enforcement — stopping expensive work before it starts +- Concurrency-safe accounting — preventing race conditions on budget + +The two sit at different points in the request path. + +A rate limiter typically sits at the edge — at the API gateway or load balancer. It decides whether the request may enter the system at all. + +Cycles sits inside the application logic — at the point where an agent is about to make an expensive decision. It decides whether that specific action is allowed given the current budget state. 
+ +A request can pass the rate limiter (it is within velocity limits) and still be denied by Cycles (the budget is exhausted). These are independent, complementary checks. + +## The architecture in practice + +A typical flow looks like this: + +``` +Request arrives + → Rate limiter: within RPM? → Yes → proceed + → Cycles: budget available? → Reserve + → Execute agent action (LLM call, tool use) + → Cycles: commit actual cost, release remainder +``` + +If the rate limiter says no, the request never reaches Cycles. That is correct — abuse prevention should happen first. + +If the rate limiter says yes but Cycles says no, the agent action does not execute. That is also correct — the work is within velocity limits but exceeds budget. + +If both say yes, the action proceeds with reserved budget. After execution, the actual cost is committed and any unused reservation is released. + +## The key insight + +Rate limiting and budget authority are orthogonal controls. + +Rate limiting governs the speed of requests. It prevents bursts and abuse. It is stateless in the sense that it does not track what those requests cost in aggregate. + +Budget authority governs the total exposure of execution. It prevents overspend and cost runaway. It is stateful — it tracks reservations, commits, and remaining balances across scopes. + +An AI agent that respects rate limits can still create unbounded cost. + +An AI agent governed by Cycles cannot exceed its budget, regardless of how fast or slow it operates. + +Rate limiting answers **how fast?** + +Cycles answers **how much?** + +Production systems need both answers. 
+ +## Next steps + +- Read the [Cycles Protocol](https://github.com/runcycles/cycles-protocol) +- Run the [Cycles Server](https://github.com/runcycles/cycles-server) +- Integrate with Python using the [Python Client](/quickstart/getting-started-with-the-python-client) +- Integrate with TypeScript using the [TypeScript Client](/quickstart/getting-started-with-the-typescript-client) +- Try the [End-to-End Tutorial](/quickstart/end-to-end-tutorial) — zero to a working budget-guarded LLM call in ten minutes diff --git a/how-to/integrating-cycles-with-anthropic.md b/how-to/integrating-cycles-with-anthropic.md index 3536b87..4bf3d1e 100644 --- a/how-to/integrating-cycles-with-anthropic.md +++ b/how-to/integrating-cycles-with-anthropic.md @@ -22,6 +22,26 @@ export ANTHROPIC_API_KEY="sk-ant-..." > **Need an API key?** Create one via the Admin Server — see [Deploy the Full Stack](/quickstart/deploying-the-full-cycles-stack#step-3-create-an-api-key) or [API Key Management](/how-to/api-key-management-in-cycles). +::: tip 60-Second Quick Start +```python +from anthropic import Anthropic +from runcycles import CyclesClient, CyclesConfig, cycles, set_default_client + +set_default_client(CyclesClient(CyclesConfig.from_env())) + +@cycles(estimate=2_000_000, action_kind="llm.completion", action_name="claude-sonnet-4") +def ask(prompt: str) -> str: + return Anthropic().messages.create( + model="claude-sonnet-4-20250514", + max_tokens=1024, + messages=[{"role": "user", "content": prompt}], + ).content[0].text + +print(ask("What is budget authority?")) +``` +Every call is now budget-guarded. If the budget is exhausted, `BudgetExceededError` is raised _before_ the Anthropic call is made. Read on for production patterns with per-token cost tracking and tool-use workflows. 
+::: + ## Simple decorator pattern Use `@cycles` to wrap a single Anthropic call with automatic reserve → execute → commit: diff --git a/how-to/integrating-cycles-with-langchain.md b/how-to/integrating-cycles-with-langchain.md index 2c65380..4b7e88d 100644 --- a/how-to/integrating-cycles-with-langchain.md +++ b/how-to/integrating-cycles-with-langchain.md @@ -22,6 +22,22 @@ export OPENAI_API_KEY="sk-..." > **Need an API key?** Create one via the Admin Server — see [Deploy the Full Stack](/quickstart/deploying-the-full-cycles-stack#step-3-create-an-api-key) or [API Key Management](/how-to/api-key-management-in-cycles). +::: tip 60-Second Quick Start +```python +from langchain_openai import ChatOpenAI +from langchain_core.messages import HumanMessage +from runcycles import CyclesClient, CyclesConfig, Subject + +client = CyclesClient(CyclesConfig.from_env()) +handler = CyclesBudgetHandler(client=client, subject=Subject(tenant="acme", agent="my-agent")) + +llm = ChatOpenAI(model="gpt-4o", callbacks=[handler]) +result = llm.invoke([HumanMessage(content="What is budget authority?")]) +print(result.content) +``` +Every LLM call — including multi-turn agent loops — is now budget-guarded. See the full `CyclesBudgetHandler` implementation below. +::: + ## The callback handler approach LangChain's callback system fires events on every LLM call. A custom `BaseCallbackHandler` can hook into `on_llm_start` and `on_llm_end` to create and commit Cycles reservations: diff --git a/how-to/integrating-cycles-with-openai.md b/how-to/integrating-cycles-with-openai.md index de91aff..ae31c85 100644 --- a/how-to/integrating-cycles-with-openai.md +++ b/how-to/integrating-cycles-with-openai.md @@ -24,6 +24,25 @@ export OPENAI_API_KEY="sk-..." > **Need an API key?** Create one via the Admin Server — see [Deploy the Full Stack](/quickstart/deploying-the-full-cycles-stack#step-3-create-an-api-key) or [API Key Management](/how-to/api-key-management-in-cycles). 
+::: tip 60-Second Quick Start +```python +from openai import OpenAI +from runcycles import CyclesClient, CyclesConfig, cycles, set_default_client + +set_default_client(CyclesClient(CyclesConfig.from_env())) + +@cycles(estimate=1_500_000, action_kind="llm.completion", action_name="gpt-4o") +def ask(prompt: str) -> str: + return OpenAI().chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": prompt}], + ).choices[0].message.content + +print(ask("What is budget authority?")) +``` +That's it — every call is now budget-guarded. If the budget is exhausted, `BudgetExceededError` is raised _before_ the OpenAI call is made. Read on for production patterns with accurate cost tracking. +::: + ## Basic pattern Use the `@cycles` decorator to wrap an OpenAI call with automatic reserve → execute → commit: diff --git a/how-to/integrating-cycles-with-vercel-ai-sdk.md b/how-to/integrating-cycles-with-vercel-ai-sdk.md index 4065f46..9150d5e 100644 --- a/how-to/integrating-cycles-with-vercel-ai-sdk.md +++ b/how-to/integrating-cycles-with-vercel-ai-sdk.md @@ -30,6 +30,29 @@ CYCLES_TENANT=acme-corp OPENAI_API_KEY=sk-... ``` +::: tip 60-Second Quick Start +```typescript +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { CyclesClient, CyclesConfig, reserveForStream } from "runcycles"; + +const cycles = new CyclesClient(CyclesConfig.fromEnv()); +const handle = await reserveForStream({ + client: cycles, estimate: 2_000_000, unit: "USD_MICROCENTS", + actionKind: "llm.completion", actionName: "gpt-4o", +}); + +const result = streamText({ + model: openai("gpt-4o"), + prompt: "What is budget authority?", + onFinish: async ({ usage }) => { + await handle.commit((usage.promptTokens ?? 0) * 250 + (usage.completionTokens ?? 0) * 1000); + }, +}); +``` +Budget is reserved before the stream starts and committed when it finishes. Read on for the full Next.js API route pattern with error handling. 
+::: + ## API route with budget governance Create an API route that reserves budget before streaming and commits actual usage after: diff --git a/public/llms.txt b/public/llms.txt index d2dc00b..8c34a24 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -47,6 +47,16 @@ Use Cycles when you need to: enforce spend limits on LLM calls, bound tool invoc - [Idempotency, Retries, and Concurrency](https://runcycles.io/concepts/idempotency-retries-and-concurrency-why-cycles-is-built-for-real-failure-modes): How Cycles handles real-world failure modes - [From Observability to Enforcement](https://runcycles.io/concepts/from-observability-to-enforcement-how-teams-evolve-from-dashboards-to-budget-authority): How teams evolve from dashboards to budget authority - [How Cycles Compares](https://runcycles.io/concepts/how-cycles-compares-to-rate-limiters-observability-provider-caps-in-app-counters-and-job-schedulers): Comparison with rate limiters, observability, provider caps, and counters +- [Cycles vs Rate Limiting](https://runcycles.io/concepts/cycles-vs-rate-limiting): Why velocity controls fail for AI agents +- [Cycles vs Guardrails AI](https://runcycles.io/concepts/cycles-vs-guardrails-ai): Budget authority vs content safety — different problems, complementary solutions +- [Cycles vs Provider Spending Caps](https://runcycles.io/concepts/cycles-vs-provider-spending-caps): Why platform limits are not enough for per-run budget control +- [Cycles vs Custom Token Counters](https://runcycles.io/concepts/cycles-vs-custom-token-counters): Build vs buy for agent budget control + +## Blog + +- [How Much Do AI Agents Actually Cost?](https://runcycles.io/blog/how-much-do-ai-agents-cost): Cost breakdown by provider and use case +- [AI Agent Cost Management: The Complete Guide](https://runcycles.io/blog/ai-agent-cost-management-guide): Maturity model from monitoring to enforcement +- [5 Real-World AI Agent Failures That Budget Controls Would Have 
Prevented](https://runcycles.io/blog/ai-agent-failures-budget-controls-prevent): Concrete failure scenarios with dollar amounts ## Integration Guides