In [1]:
from kamiwaza_client import KamiwazaClient
import openai
from pydantic import BaseModel

In [2]:
client = KamiwazaClient("http://34.230.49.204:7777/api/")
client.serving.list_active_deployments()

[ActiveModelDeployment(id=UUID('9974c1d7-444d-4f78-9bf6-30ecef701d43'), m_id=UUID('c45afd07-17b5-4509-b026-33c43e3b5467'), m_name='Qwen3-32B-AWQ', status='DEPLOYED', instances=[ModelInstance:
 ID: 70b4c6ea-41fa-458a-8e0a-2555c262a654
 Deployment ID: 9974c1d7-444d-4f78-9bf6-30ecef701d43
 Status: DEPLOYED
 Listen Port: 32775], lb_port=51105, endpoint='http://34.230.49.204:51105/v1')]

In [3]:
openai_client = client.openai.get_client('Qwen3-32B-AWQ')

## Define the runtime tool & its JSON schema
We expose a dummy weather function so the demo is entirely self‑contained.


In [5]:
from typing import Dict, Any
import json, time
from pydantic import BaseModel, Field

class WeatherParams(BaseModel):
    city: str = Field(..., description="City name")
    state: str = Field(..., description="US state abbreviation, e.g. CA")
    unit: str = Field(..., description="Either 'celsius' or 'fahrenheit'")

def get_current_weather(city: str, state: str, unit: str) -> Dict[str, Any]:
    """Pretend weather service so we don't need a real API call."""
    fake_temp_f = 100  # you could randomise this or call a real API
    temp = fake_temp_f if unit == "fahrenheit" else round((fake_temp_f - 32) * 5 / 9)
    time.sleep(0.3)  # small pause so the streaming looks lively
    return {
        "temperature": temp,
        "unit": unit,
        "city": city,
        "state": state,
        "description": "clear skies with a light breeze",
    }

weather_tool_schema = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given US city",
        "parameters": WeatherParams.schema(),  # Pydantic → JSON Schema
    },
}

tools = [weather_tool_schema]

local_tool_registry = {"get_current_weather": get_current_weather}
print("✅ Tool registered — ready for the chat call")


✅ Tool registered — ready for the chat call


/var/folders/mc/ls40y_m57zz0hw7gjzblhxpc0000gn/T/ipykernel_49232/2537202073.py:28: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  "parameters": WeatherParams.schema(),  # Pydantic → JSON Schema


## Kick off a streaming chat with tool support
The system prompt asks the model to *think in public* using `<thinking>` 
tags, then decide whether to call the weather tool.


In [6]:
messages = [
    {
        "role": "system",
        "content": (
            "You are a helpful assistant. When you receive tool outputs, "
            "always provide a complete and helpful final answer to the user's question. "
            "Be conversational and provide practical advice."
        ),
    },
    {
        "role": "user",
        "content": "should i wear short sleeves or long sleeves in nyc today?",
    },
]

print("\n📡 **Streaming** — watch the tokens, tool call, and final answer arrive\n")

# First API call with tools
response_stream = openai_client.chat.completions.create(
    model="model", 
    messages=messages,
    tools=tools,
    tool_choice="auto",  # Let the model decide when to use tools
    stream=True,
)

# Track tool calls as they stream in
tool_calls = []
current_tool_call = None
accumulated_content = ""

for chunk in response_stream:
    if not chunk.choices:
        continue
    
    choice = chunk.choices[0]
    delta = choice.delta
    
    # Handle reasoning content (if your model supports it)
    if hasattr(delta, "reasoning_content") and delta.reasoning_content:
        print("\033[36m" + delta.reasoning_content + "\033[0m", end="", flush=True)
    
    # Handle regular content
    if delta.content:
        accumulated_content += delta.content
        print(delta.content, end="", flush=True)
    
    # Handle tool calls
    if delta.tool_calls:
        for tc_delta in delta.tool_calls:
            # Start a new tool call
            if tc_delta.index == 0 and not current_tool_call:
                current_tool_call = {
                    "id": tc_delta.id or f"call_{uuid.uuid4().hex[:8]}",
                    "type": "function",
                    "function": {
                        "name": tc_delta.function.name,
                        "arguments": ""
                    }
                }
            
            # Accumulate arguments
            if tc_delta.function.arguments:
                current_tool_call["function"]["arguments"] += tc_delta.function.arguments

# Execute tool calls if any
if choice.finish_reason == "tool_calls" and current_tool_call:
    print(f"\n\n🔧 Model requested tool: {current_tool_call['function']['name']}")
    
    # Parse arguments and execute
    args = json.loads(current_tool_call["function"]["arguments"])
    print(f"   with args: {args}")
    
    tool_name = current_tool_call["function"]["name"]
    tool_result = local_tool_registry[tool_name](**args)
    print(f"🔧 Tool response: {tool_result}")
    
    # Add the assistant's tool call message
    messages.append({
        "role": "assistant",
        "content": accumulated_content if accumulated_content else None,
        "tool_calls": [current_tool_call]
    })
    
    # Add the tool response
    messages.append({
        "role": "tool",
        "tool_call_id": current_tool_call["id"],
        "name": tool_name,
        "content": json.dumps(tool_result)
    })
    
    # Make a follow-up call to get the final answer
    print("\n\n💭 Getting final answer...\n")
    
    follow_stream = openai_client.chat.completions.create(
        model="model",
        messages=messages,
        stream=True,
        # No tools this time - we want a final answer
    )
    
    # Stream the final response
    for follow_chunk in follow_stream:
        if not follow_chunk.choices:
            continue
        
        follow_choice = follow_chunk.choices[0]
        follow_delta = follow_choice.delta
        
        # Handle reasoning content
        if hasattr(follow_delta, "reasoning_content") and follow_delta.reasoning_content:
            print("\033[36m" + follow_delta.reasoning_content + "\033[0m", end="", flush=True)
        
        # Handle final answer content
        if follow_delta.content:
            print(follow_delta.content, end="", flush=True)
    
    print("\n\n🎉 **Done!**")

# If no tool was called, we're done
elif choice.finish_reason == "stop":
    print("\n\n🎉 **Done!** (No tool calls needed)")

2025-06-11 12:15:05,181 - httpx - INFO - HTTP Request: POST http://34.230.49.204:51105/v1/chat/completions "HTTP/1.1 200 OK"



📡 **Streaming** — watch the tokens, tool call, and final answer arrive

[36m
[0m[36mOkay[0m[36m,[0m[36m the[0m[36m user[0m[36m is[0m[36m asking[0m[36m whether[0m[36m they[0m[36m should[0m[36m wear[0m[36m short[0m[36m sleeves[0m[36m or[0m[36m long[0m[36m sleeves[0m[36m in[0m[36m NYC[0m[36m today[0m[36m.[0m[36m To[0m[36m figure[0m[36m this[0m[36m out[0m[36m,[0m[36m I[0m[36m need[0m[36m to[0m[36m know[0m[36m the[0m[36m current[0m[36m weather[0m[36m there[0m[36m.[0m[36m The[0m[36m tools[0m[36m provided[0m[36m include[0m[36m a[0m[36m function[0m[36m called[0m[36m get[0m[36m_current[0m[36m_weather[0m[36m,[0m[36m which[0m[36m requires[0m[36m city[0m[36m,[0m[36m state[0m[36m,[0m[36m and[0m[36m unit[0m[36m parameters[0m[36m.[0m[36m 

[0m[36mFirst[0m[36m,[0m[36m NYC[0m[36m is[0m[36m New[0m[36m York[0m[36m City[0m[36m,[0m[36m so[0m[36m the[0m[36m city[0m[36m para

2025-06-11 12:15:13,823 - httpx - INFO - HTTP Request: POST http://34.230.49.204:51105/v1/chat/completions "HTTP/1.1 200 OK"


🔧 Tool response: {'temperature': 100, 'unit': 'fahrenheit', 'city': 'New York', 'state': 'NY', 'description': 'clear skies with a light breeze'}


💭 Getting final answer...

[36m
[0m[36mOkay[0m[36m,[0m[36m the[0m[36m user[0m[36m is[0m[36m asking[0m[36m whether[0m[36m to[0m[36m wear[0m[36m short[0m[36m sleeves[0m[36m or[0m[36m long[0m[36m sleeves[0m[36m in[0m[36m NYC[0m[36m today[0m[36m.[0m[36m I[0m[36m need[0m[36m to[0m[36m check[0m[36m the[0m[36m weather[0m[36m.[0m[36m I[0m[36m'll[0m[36m use[0m[36m the[0m[36m get[0m[36m_current[0m[36m_weather[0m[36m tool[0m[36m for[0m[36m New[0m[36m York[0m[36m.[0m[36m The[0m[36m response[0m[36m says[0m[36m [0m[36m1[0m[36m0[0m[36m0[0m[36m°F[0m[36m with[0m[36m clear[0m[36m skies[0m[36m and[0m[36m a[0m[36m light[0m[36m breeze[0m[36m.[0m[36m That[0m[36m's[0m[36m pretty[0m[36m hot[0m[36m.[0m[36m At[0m[36m [0m[36m1[0m[36m0[0m[36m0