Streaming token usage shows doubled input tokens vs non-streaming #2770

@gustavo-braga-electric

Description

Hi everyone! I've discovered a token-counting discrepancy in Pydantic AI v0.8.1: the streaming methods always report exactly 2x the input tokens compared to the non-streaming methods (at least for Anthropic models). Below is a script to reproduce and verify the issue.

from pydantic_ai import Agent
from pydantic_ai.messages import PartDeltaEvent, PartStartEvent, TextPartDelta
from pydantic_graph import End
import asyncio
import logfire

logfire.configure()
logfire.instrument_pydantic_ai()

agent = Agent(
    'anthropic:claude-sonnet-4-0',
    deps_type=str,
    system_prompt="You always respond with a 'Hello! Counting tokens!'",
)

result = agent.run_sync("Hi!")

print("Non-streaming:")
print(result.output)
print(result.usage())

print("-" * 100)

async def stream():
    async with agent.iter('Hello!') as agent_run:
        node = agent_run.next_node
        all_nodes = [node]

        # Drive the iteration manually, yielding text as it streams in:
        while not isinstance(node, End):
            node = await agent_run.next(node)
            if Agent.is_model_request_node(node):
                async with node.stream(agent_run.ctx) as request_stream:
                    async for event in request_stream:
                        if isinstance(event, PartStartEvent):
                            yield event.part.content
                        if isinstance(event, PartDeltaEvent):
                            if isinstance(event.delta, TextPartDelta):
                                yield event.delta.content_delta
            all_nodes.append(node)

    print()
    print("Streaming:")
    print(all_nodes)
    print(agent_run.result.output)
    print(agent_run.result.usage())

async def main():
    async for chunk in stream():
        print(chunk, end='', flush=True)  # print chunks without extra newlines

asyncio.run(main())
The results are as follows:
Non-streaming:
Hello! Counting tokens!
RunUsage(input_tokens=22, output_tokens=9, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 22, 'output_tokens': 9}, requests=1)
----------------------------------------------------------------------------------------------------
14:56:42.319 agent run
14:56:42.319   chat claude-sonnet-4-0
Hello! Counting tokens!
Streaming:
[UserPromptNode(user_prompt='Hello!', instructions=None, instructions_functions=[], system_prompts=("You always respond with a 'Hello! Counting tokens!'",), system_prompt_functions=[], system_prompt_dynamic_functions={}), ModelRequestNode(request=ModelRequest(parts=[SystemPromptPart(content="You always respond with a 'Hello! Counting tokens!'", timestamp=datetime.datetime(2025, 9, 2, 17, 56, 42, 319654, tzinfo=datetime.timezone.utc)), UserPromptPart(content='Hello!', timestamp=datetime.datetime(2025, 9, 2, 17, 56, 42, 319657, tzinfo=datetime.timezone.utc))])), CallToolsNode(model_response=ModelResponse(parts=[TextPart(content='Hello! Counting tokens!')], usage=RequestUsage(input_tokens=44, output_tokens=12, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 44, 'output_tokens': 12}), model_name='claude-sonnet-4-0', timestamp=datetime.datetime(2025, 9, 2, 17, 56, 44, 479301, tzinfo=datetime.timezone.utc), provider_name='anthropic')), End(data=FinalResult(output='Hello! Counting tokens!'))]
Hello! Counting tokens!
RunUsage(input_tokens=44, output_tokens=12, details={'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 44, 'output_tokens': 12}, requests=1)

As you can see, the non-streaming count is exactly right, and the Anthropic dashboard confirms it: 22 input tokens and 9 output tokens. But with streaming enabled I always get exactly 2x the input tokens, while the output count varies a bit. I suspect the output variance has something to do with the number of chunks coming in, and that pydantic_ai is incrementing the usage incorrectly somewhere. I tested both run_stream() and agent.iter() (sketches below), and both show the same behaviour. I also checked the usage reported in Logfire, but the error persists there as well, so it presumably reads its usage from the same source as result.usage().
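For completeness, the run_stream() variant I tested looked roughly like this (a minimal sketch of the same repro; it reports the same doubled input_tokens for me):

import asyncio

from pydantic_ai import Agent

agent = Agent(
    'anthropic:claude-sonnet-4-0',
    system_prompt="You always respond with a 'Hello! Counting tokens!'",
)

async def main():
    async with agent.run_stream('Hello!') as result:
        # Stream the text deltas as they arrive.
        async for delta in result.stream_text(delta=True):
            print(delta, end='', flush=True)
        print()
        print(result.usage())  # input_tokens is doubled here as well

asyncio.run(main())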
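And to spell out what I mean by "wrong incrementation", here is a toy illustration (purely my guess, not pydantic_ai's actual internals): if the provider attaches input_tokens to more than one stream event and those per-event usages are summed instead of replaced, the input count comes out at exactly 2x, which matches the numbers above.

# Toy illustration of the hypothesis only -- not pydantic_ai's real code.
# Suppose the provider reports input_tokens on two stream events:
events = [
    {'type': 'message_start', 'input_tokens': 22, 'output_tokens': 1},
    {'type': 'message_delta', 'input_tokens': 22, 'output_tokens': 9},
]

summed = sum(e['input_tokens'] for e in events)  # 44: what streaming reports
latest = events[-1]['input_tokens']              # 22: what the dashboard shows
print(f'summed={summed} latest={latest}')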

Example Code

(Same script as in the Description above.)

Python, Pydantic AI & LLM client version

python==3.12.3
pydantic_ai==0.8.1
anthropic==0.65.0

Labels

bug (Something isn't working)