# LiteLLM Metadata Exploration

This notebook shows what metadata we get from litellm calls and what's available to save in the database.

Focus: Understanding usage/cost data and other metadata from both streaming and non-streaming responses.

In [1]:
import json
import litellm
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the provider API keys litellm needs -- confirm against your .env).
load_dotenv()

# Minimal single-turn prompt reused by the completion cells below.
messages = [{"role": "user", "content": "What is 2+2? Be concise."}]

## Non-Streaming Response

Shows all available metadata from a standard completion call.

In [None]:
# Non-streaming call (similar to reasoning_agent.py step generation)
response = await litellm.acompletion(
    model="gpt-4o-mini",
    messages=messages,
    temperature=0.0,
)

print("=" * 60)
print("NON-STREAMING RESPONSE")
print("=" * 60)
print(f"\nResponse type: {type(response)}")
print("\nResponse object attributes:")
for attr in dir(response):
    if not attr.startswith('_'):
        print(f"  - {attr}")

print("\n" + "=" * 60)
print("KEY METADATA FIELDS")
print("=" * 60)

# ID and timestamps
print(f"\nID: {response.id}")
print(f"Created: {response.created}")
print(f"Model: {response.model}")

# Usage information
print(f"\nUsage object: {response.usage}")
if response.usage:
    print(f"  Prompt tokens: {response.usage.prompt_tokens}")
    print(f"  Completion tokens: {response.usage.completion_tokens}")
    print(f"  Total tokens: {response.usage.total_tokens}")

    # Cost information (if available from litellm)
    usage_dict = response.usage.model_dump() if hasattr(response.usage, 'model_dump') else \
        response.usage.__dict__
    print(f"\n  Full usage dict: {json.dumps(usage_dict, indent=2, default=str)}")

# Response content
print(f"\nChoices: {len(response.choices)}")
if response.choices:
    choice = response.choices[0]
    print(f"  Finish reason: {choice.finish_reason}")
    print(f"  Message role: {choice.message.role}")
    print(f"  Message content: {choice.message.content}")

# Hidden metadata (litellm-specific)
print("\n" + "=" * 60)
print("LITELLM INTERNAL METADATA")
print("=" * 60)
if hasattr(response, '_hidden_params'):
    print(f"Hidden params: {json.dumps(response._hidden_params, indent=2, default=str)}")

# Full response as dict
print("\n" + "=" * 60)
print("FULL RESPONSE DICT")
print("=" * 60)
response_dict = response.model_dump() if hasattr(response, 'model_dump') else response.__dict__
print(json.dumps(response_dict, indent=2, default=str))

NON-STREAMING RESPONSE

Response type: <class 'litellm.types.utils.ModelResponse'>

Response object attributes:
  - choices
  - construct
  - copy
  - created
  - dict
  - from_orm
  - get
  - id
  - json
  - model
  - model_computed_fields
  - model_config
  - model_construct
  - model_copy
  - model_dump
  - model_dump_json
  - model_extra
  - model_fields
  - model_fields_set
  - model_json_schema
  - model_parametrized_name
  - model_post_init
  - model_rebuild
  - model_validate
  - model_validate_json
  - model_validate_strings
  - object
  - parse_file
  - parse_obj
  - parse_raw
  - schema
  - schema_json
  - system_fingerprint
  - to_dict
  - to_json
  - update_forward_refs
  - validate

KEY METADATA FIELDS

ID: chatcmpl-CZ4B8pVMaxDZmVBx5PByTiuirXvZr
Created: 1762473578
Model: gpt-4o-mini-2024-07-18

Usage object: Usage(completion_tokens=8, prompt_tokens=17, total_tokens=25, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, 

## Streaming Response

Shows what metadata is available in streaming chunks (used in passthrough.py and final synthesis).

In [None]:
# Streaming call (similar to passthrough.py)
stream = await litellm.acompletion(
    model="gpt-4o-mini",
    messages=messages,
    temperature=0.0,
    stream=True,
    stream_options={"include_usage": True},  # Request usage in final chunk
)

print("=" * 60)
print("STREAMING RESPONSE CHUNKS")
print("=" * 60)

chunk_count = 0
accumulated_usage = None

async for chunk in stream:
    chunk_count += 1
    print(f"\nChunk {chunk_count}:")
    print(f"  Type: {type(chunk)}")
    print(f"  ID: {chunk.id}")
    print(f"  Model: {chunk.model}")
    print(f"  Created: {chunk.created}")

    # Check for usage (only in final chunk with stream_options)
    chunk_usage = getattr(chunk, 'usage', None)
    if chunk_usage:
        print("  ✅ USAGE DATA FOUND:")
        print(f"     Prompt tokens: {chunk_usage.prompt_tokens}")
        print(f"     Completion tokens: {chunk_usage.completion_tokens}")
        print(f"     Total tokens: {chunk_usage.total_tokens}")
        accumulated_usage = chunk_usage

        # Check for cost data
        usage_dict = chunk_usage.model_dump() if hasattr(chunk_usage, 'model_dump') \
            else chunk_usage.__dict__
        print(f"     Full usage dict: {json.dumps(usage_dict, indent=2, default=str)}")

    # Delta content
    if chunk.choices:
        delta = chunk.choices[0].delta
        if hasattr(delta, 'content') and delta.content:
            print(f"  Content: {delta.content!r}")
        if hasattr(delta, 'role') and delta.role:
            print(f"  Role: {delta.role}")
        print(f"  Finish reason: {chunk.choices[0].finish_reason}")

print("\n" + "=" * 60)
print("STREAMING SUMMARY")
print("=" * 60)
print(f"Total chunks: {chunk_count}")
print(f"Usage data received: {'✅ Yes' if accumulated_usage else '❌ No'}")
if accumulated_usage:
    print(f"  Total tokens: {accumulated_usage.total_tokens}")

STREAMING RESPONSE CHUNKS

Chunk 1:
  Type: <class 'litellm.types.utils.ModelResponseStream'>
  ID: chatcmpl-CZ4BAGcJBmqHwRDceEe69SGLy26ju
  Model: gpt-4o-mini
  Created: 1762473580
  Content: '2'
  Role: assistant
  Finish reason: None

Chunk 2:
  Type: <class 'litellm.types.utils.ModelResponseStream'>
  ID: chatcmpl-CZ4BAGcJBmqHwRDceEe69SGLy26ju
  Model: gpt-4o-mini
  Created: 1762473580
  Content: ' +'
  Finish reason: None

Chunk 3:
  Type: <class 'litellm.types.utils.ModelResponseStream'>
  ID: chatcmpl-CZ4BAGcJBmqHwRDceEe69SGLy26ju
  Model: gpt-4o-mini
  Created: 1762473580
  Content: ' '
  Finish reason: None

Chunk 4:
  Type: <class 'litellm.types.utils.ModelResponseStream'>
  ID: chatcmpl-CZ4BAGcJBmqHwRDceEe69SGLy26ju
  Model: gpt-4o-mini
  Created: 1762473580
  Content: '2'
  Finish reason: None

Chunk 5:
  Type: <class 'litellm.types.utils.ModelResponseStream'>
  ID: chatcmpl-CZ4BAGcJBmqHwRDceEe69SGLy26ju
  Model: gpt-4o-mini
  Created: 1762473580
  Content: ' ='
  Finish re

## Cost Calculation

LiteLLM can compute the cost of a call with the `litellm.completion_cost()` function.

In [5]:
# Cost calculation using litellm helper
response = await litellm.acompletion(
    model="gpt-4o-mini",
    messages=messages,
    temperature=0.0,
)

print("=" * 60)
print("COST CALCULATION")
print("=" * 60)

# Calculate cost using litellm
try:
    cost = litellm.completion_cost(completion_response=response)
    print(f"\nTotal cost: ${cost:.6f}")

    # Calculate per-token costs
    if response.usage:
        prompt_cost = (cost / response.usage.total_tokens) * response.usage.prompt_tokens
        completion_cost = (cost / response.usage.total_tokens) * response.usage.completion_tokens
        print(f"Prompt cost: ${prompt_cost:.6f}")
        print(f"Completion cost: ${completion_cost:.6f}")
except Exception as e:
    print(f"Cost calculation error: {e}")

# Alternative: Access cost from response if litellm adds it
response_dict = response.model_dump() if hasattr(response, 'model_dump') else response.__dict__
if '_hidden_params' in response_dict:
    hidden = response_dict['_hidden_params']
    if 'response_cost' in hidden:
        print(f"\nCost from _hidden_params: ${hidden['response_cost']:.6f}")

COST CALCULATION

Total cost: $0.000007
Prompt cost: $0.000005
Completion cost: $0.000002


## Recommended Metadata to Store

Based on what's available from litellm, here's what we should consider storing in the `messages.metadata` JSONB field:

```python
{
    # Token usage
    "prompt_tokens": 10,
    "completion_tokens": 5,
    "total_tokens": 15,
    
    # Costs (calculated via litellm.completion_cost)
    "prompt_cost": 0.000015,
    "completion_cost": 0.000075,
    "total_cost": 0.000090,
    
    # Model info
    "model": "gpt-4o-mini",
    "completion_id": "chatcmpl-...",
    
    # Request parameters
    "temperature": 0.2,
    "max_tokens": 1000,
    
    # Timing and completion status
    "created_at": 1699564800,
    "finish_reason": "stop",
    
    # Routing (for debugging)
    "routing_path": "passthrough",  # or "reasoning", "orchestration"
    "session_id": "...",
}
```

---

In [3]:
from litellm import token_counter

# Count tokens for a sample chat message, and for the same text passed as a raw string.
# The message must use the chat schema {"role": ..., "content": ...}; the
# role/user key and value were previously swapped.
# NOTE: this rebinds the shared `messages` prompt defined in the first cell;
# re-run that cell before re-running the completion cells above.
messages = [{"role": "user", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-4o-mini", messages=messages))
print(token_counter(model="claude-haiku-4-5", messages=messages))
print(token_counter(model="claude-haiku-4-5", text="Hey, how's it going"))

13
13
6


In [12]:
import litellm
import json
# Look up litellm's model metadata: print max_input_tokens for two models,
# then dump the complete gpt-4o-mini record.
gpt_4o_mini_info = litellm.get_model_info("gpt-4o-mini")
print(gpt_4o_mini_info['max_input_tokens'])

claude_haiku_info = litellm.get_model_info("claude-haiku-4-5")
print(claude_haiku_info['max_input_tokens'])

print(json.dumps(gpt_4o_mini_info, indent=2))

128000
200000
{
  "key": "gpt-4o-mini",
  "max_tokens": 16384,
  "max_input_tokens": 128000,
  "max_output_tokens": 16384,
  "input_cost_per_token": 1.5e-07,
  "input_cost_per_token_flex": null,
  "input_cost_per_token_priority": 2.5e-07,
  "cache_creation_input_token_cost": null,
  "cache_creation_input_token_cost_above_200k_tokens": null,
  "cache_read_input_token_cost": 7.5e-08,
  "cache_read_input_token_cost_above_200k_tokens": null,
  "cache_read_input_token_cost_flex": null,
  "cache_read_input_token_cost_priority": 1.25e-07,
  "cache_creation_input_token_cost_above_1hr": null,
  "input_cost_per_character": null,
  "input_cost_per_token_above_128k_tokens": null,
  "input_cost_per_token_above_200k_tokens": null,
  "input_cost_per_query": null,
  "input_cost_per_second": null,
  "input_cost_per_audio_token": null,
  "input_cost_per_token_batches": 7.5e-08,
  "output_cost_per_token_batches": 3e-07,
  "output_cost_per_token": 6e-07,
  "output_cost_per_token_flex": null,
  "output_cos