diff --git a/src/strands/models/litellm.py b/src/strands/models/litellm.py
index 17f1bbb94..1f1e999d2 100644
--- a/src/strands/models/litellm.py
+++ b/src/strands/models/litellm.py
@@ -222,12 +222,11 @@ def format_chunk(self, event: dict[str, Any], **kwargs: Any) -> StreamEvent:
 
                 # Only LiteLLM over Anthropic supports cache write tokens
                 # Waiting until a more general approach is available to set cacheWriteInputTokens
-
                 if tokens_details := getattr(event["data"], "prompt_tokens_details", None):
                     if cached := getattr(tokens_details, "cached_tokens", None):
                         usage_data["cacheReadInputTokens"] = cached
-                    if creation := getattr(tokens_details, "cache_creation_tokens", None):
-                        usage_data["cacheWriteInputTokens"] = creation
+                if creation := getattr(event["data"], "cache_creation_input_tokens", None):
+                    usage_data["cacheWriteInputTokens"] = creation
 
                 return StreamEvent(
                     metadata=MetadataEvent(
diff --git a/tests/strands/models/test_litellm.py b/tests/strands/models/test_litellm.py
index aafee1d17..832b5c836 100644
--- a/tests/strands/models/test_litellm.py
+++ b/tests/strands/models/test_litellm.py
@@ -193,7 +193,7 @@ async def test_stream(litellm_acompletion, api_key, model_id, model, agenerator,
     mock_event_8 = unittest.mock.Mock(choices=[unittest.mock.Mock(finish_reason="tool_calls", delta=mock_delta_8)])
     mock_event_9 = unittest.mock.Mock()
     mock_event_9.usage.prompt_tokens_details.cached_tokens = 10
-    mock_event_9.usage.prompt_tokens_details.cache_creation_tokens = 10
+    mock_event_9.usage.cache_creation_input_tokens = 10
 
     litellm_acompletion.side_effect = unittest.mock.AsyncMock(
         return_value=agenerator(
@@ -255,7 +255,7 @@ async def test_stream(litellm_acompletion, api_key, model_id, model, agenerator,
             "metadata": {
                 "usage": {
                     "cacheReadInputTokens": mock_event_9.usage.prompt_tokens_details.cached_tokens,
-                    "cacheWriteInputTokens": mock_event_9.usage.prompt_tokens_details.cache_creation_tokens,
+                    "cacheWriteInputTokens": mock_event_9.usage.cache_creation_input_tokens,
                     "inputTokens": mock_event_9.usage.prompt_tokens,
                     "outputTokens": mock_event_9.usage.completion_tokens,
                     "totalTokens": mock_event_9.usage.total_tokens,
diff --git a/tests_integ/models/test_model_litellm.py b/tests_integ/models/test_model_litellm.py
index f177c08a4..d72937641 100644
--- a/tests_integ/models/test_model_litellm.py
+++ b/tests_integ/models/test_model_litellm.py
@@ -1,4 +1,5 @@
 import unittest.mock
+from uuid import uuid4
 
 import pydantic
 import pytest
@@ -220,7 +221,7 @@ async def test_cache_read_tokens_multi_turn(model):
     system_prompt_content: list[SystemContentBlock] = [
         # Caching only works when prompts are large
-        {"text": "You are a helpful assistant. Always be concise." * 200},
+        {"text": f"You are helpful assistant No. {uuid4()} Always be concise." * 200},
         {"cachePoint": {"type": "default"}},
     ]
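
For reviewers, a minimal sketch of the behavior the first hunk targets: LiteLLM reports cache reads via the nested prompt_tokens_details.cached_tokens field, while cache write counts (reported only when LiteLLM fronts Anthropic) arrive as cache_creation_input_tokens directly on the usage object, not inside prompt_tokens_details. The helper name extract_cache_token_usage and the SimpleNamespace stand-in below are illustrative only; the real mapping lives in format_chunk in litellm.py and feeds the StreamEvent/MetadataEvent construction, which is elided here.

# Sketch only: mirrors the getattr-based lookups in format_chunk's metadata handling.
# extract_cache_token_usage is a hypothetical helper; the usage argument stands in
# for event["data"] (a LiteLLM streaming usage object).
from types import SimpleNamespace
from typing import Any


def extract_cache_token_usage(usage: Any) -> dict[str, int]:
    usage_data: dict[str, int] = {}

    # Cache *read* tokens live on the nested prompt_tokens_details object.
    if tokens_details := getattr(usage, "prompt_tokens_details", None):
        if cached := getattr(tokens_details, "cached_tokens", None):
            usage_data["cacheReadInputTokens"] = cached

    # Cache *write* tokens live directly on the usage object as
    # cache_creation_input_tokens, the attribute this patch switches to.
    if creation := getattr(usage, "cache_creation_input_tokens", None):
        usage_data["cacheWriteInputTokens"] = creation

    return usage_data


# Shaped like the mocked usage in tests/strands/models/test_litellm.py.
usage = SimpleNamespace(
    prompt_tokens_details=SimpleNamespace(cached_tokens=10),
    cache_creation_input_tokens=10,
)
assert extract_cache_token_usage(usage) == {
    "cacheReadInputTokens": 10,
    "cacheWriteInputTokens": 10,
}

The integration-test tweak serves a related purpose: salting the cached system prompt with uuid4() makes each run's prompt unique, so the first turn starts from a cache miss and writes a fresh entry instead of reading one left over from an earlier run.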