Initial Checks
- I confirm that I'm using the latest version of Pydantic AI
- I confirm that I searched for my issue in https://github.com/pydantic/pydantic-ai/issues before opening this issue
Description
Hi everyone! I've discovered a token counting discrepancy in Pydantic AI v0.8.1. The streaming methods always report exactly 2x the input tokens compared to the non-streaming methods (at least for Anthropic models). Below is a script that reproduces and verifies the issue.
from pydantic_ai import Agent
from pydantic_ai.messages import PartDeltaEvent, PartStartEvent, TextPartDelta
from pydantic_graph import End
import asyncio
import logfire

logfire.configure()
logfire.instrument_pydantic_ai()

agent = Agent(
    'anthropic:claude-sonnet-4-0',
    deps_type=str,
    system_prompt="You always respond with a 'Hello! Counting tokens!'",
)

result = agent.run_sync("Hi!")
print("Non-streaming:")
print(result.output)
print(result.usage())
print("-" * 100)


async def stream():
    async with agent.iter('Hello!') as agent_run:
        node = agent_run.next_node
        all_nodes = [node]
        # Drive the iteration manually:
        while not isinstance(node, End):
            node = await agent_run.next(node)
            if Agent.is_model_request_node(node):
                async with node.stream(agent_run.ctx) as request_stream:
                    async for event in request_stream:
                        if isinstance(event, PartStartEvent):
                            yield event.part.content
                        if isinstance(event, PartDeltaEvent):
                            if isinstance(event.delta, TextPartDelta):
                                yield event.delta.content_delta
            all_nodes.append(node)
        print()
        print("Streaming:")
        print(all_nodes)
        print(agent_run.result.output)
        print(agent_run.result.usage())


async def main():
    async for chunk in stream():
        print(chunk, end='', flush=True)  # stream chunks without newlines


asyncio.run(main())
The results are as follows:
Non-streaming:
Hello! Counting tokens!
RunUsage(input_tokens=22, output_tokens=9, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 22, 'output_tokens': 9}, requests=1)
----------------------------------------------------------------------------------------------------
14:56:42.319 agent run
14:56:42.319 chat claude-sonnet-4-0
Hello! Counting tokens!
Streaming:
[UserPromptNode(user_prompt='Hello!', instructions=None, instructions_functions=[], system_prompts=("You always respond with a 'Hello! Counting tokens!'",), system_prompt_functions=[], system_prompt_dynamic_functions={}), ModelRequestNode(request=ModelRequest(parts=[SystemPromptPart(content="You always respond with a 'Hello! Counting tokens!'", timestamp=datetime.datetime(2025, 9, 2, 17, 56, 42, 319654, tzinfo=datetime.timezone.utc)), UserPromptPart(content='Hello!', timestamp=datetime.datetime(2025, 9, 2, 17, 56, 42, 319657, tzinfo=datetime.timezone.utc))])), CallToolsNode(model_response=ModelResponse(parts=[TextPart(content='Hello! Counting tokens!')], usage=RequestUsage(input_tokens=44, output_tokens=12, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 44, 'output_tokens': 12}), model_name='claude-sonnet-4-0', timestamp=datetime.datetime(2025, 9, 2, 17, 56, 44, 479301, tzinfo=datetime.timezone.utc), provider_name='anthropic')), End(data=FinalResult(output='Hello! Counting tokens!'))]
Hello! Counting tokens!
RunUsage(input_tokens=44, output_tokens=12, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 44, 'output_tokens': 12}, requests=1)
As you can see, the non-streaming count is exactly right, and I can verify in the Anthropic dashboard that the request used 22 input tokens and 9 output tokens. But with streaming enabled I always get exactly 2x the input tokens (44 instead of 22), while the output count is only slightly off (12 instead of 9). I suspect the output discrepancy has to do with the number of chunks coming in, with pydantic_ai incrementing something incorrectly per chunk. I tested both run_stream() and agent.iter(), and both show the same behaviour. I also checked Logfire to see whether the usage reported there was different, but the error persists there as well, so it presumably gets its numbers from the same source as result.usage().
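My best guess at the mechanism (purely a guess; the event payloads below are made up for illustration, not the real SDK types): Anthropic's streaming API attaches usage to the message_start event, and the final message_delta event can carry usage fields again. If the streamed usage is accumulated per event, summing both would double input_tokens exactly while only slightly inflating output_tokens, which is the same shape of error I'm seeing:

# Hypothetical event stream, loosely modelled on Anthropic's SSE events.
# The numbers mirror my run above: the real request costs 22 input tokens.
events = [
    {'type': 'message_start', 'usage': {'input_tokens': 22, 'output_tokens': 1}},
    {'type': 'content_block_delta'},  # text deltas carry no usage
    {'type': 'message_delta', 'usage': {'input_tokens': 22, 'output_tokens': 9}},
]

input_total = output_total = 0
for event in events:
    usage = event.get('usage')
    if usage:  # naive accumulation: add the usage from every event that has one
        input_total += usage['input_tokens']
        output_total += usage['output_tokens']

print(input_total, output_total)
# -> 44 10: input exactly doubled, output slightly inflated,
#    matching RunUsage(input_tokens=44, output_tokens=12) in shape

To rule out the API itself, the same prompt can be streamed directly through the anthropic SDK, outside Pydantic AI (this snippet isn't part of my repro, just a sanity check):

import anthropic

client = anthropic.Anthropic()

# Stream the same prompt without Pydantic AI and read the usage the API reports.
with client.messages.stream(
    model='claude-sonnet-4-0',
    max_tokens=64,
    system="You always respond with a 'Hello! Counting tokens!'",
    messages=[{'role': 'user', 'content': 'Hello!'}],
) as stream:
    for text in stream.text_stream:
        print(text, end='', flush=True)
    final = stream.get_final_message()

print()
print(final.usage)  # I'd expect this to match the dashboard (~22 input tokens), not 44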
Example Code
Same script as in the Description above.
Python, Pydantic AI & LLM client version
python==3.12.3
pydantic_ai==0.8.1
anthropic==0.65.0