In [1]:
import json
import os
from datetime import date
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from time import sleep

# Load variables from a local .env file (if present) so the os.getenv()
# lookups below can see them.
load_dotenv()

# Base URL of the Llama Stack server; override with the LLAMA_STACK_URL env var.
LLAMA_STACK_URL = os.getenv("LLAMA_STACK_URL", "http://127.0.0.1:8321")
# Model identifier sent with every request.
MODEL_ID = "vllm/openai/gpt-oss-20b"
# MODEL_LABEL, VLLM_VERSION and RUN_DATE appear in the banner run_prompt prints
# above each response text.
MODEL_LABEL = "GPT-OSS-20b"
VLLM_VERSION = "0.11.2+rhai5"
# Defaults to today's date in ISO format unless RUN_DATE is set in the environment.
RUN_DATE = os.getenv("RUN_DATE", date.today().isoformat())

# Per-request limits passed to client.responses.create (semantics described below).
MAX_TOOL_CALLS = 20
MAX_INFER_ITERS = 20

# The llama-stack client is used instead of the OpenAI client because it accepts
# the max_infer_iters parameter directly.
client = LlamaStackClient(base_url=LLAMA_STACK_URL)

# max_infer_iters controls the total number of iterations (tool calls + final message).
# max_tool_calls limits the number of tool calls within those iterations.
# NOTE(review): both limits are 20, so a run that spends every iteration on tool
# calls has none left for the final message; run_prompt prints a warning for that case.

# Questions whose answers depend on recent information, so the model is expected
# to call the web_search tool.
PROMPTS = [
    "What were the top 3 headlines about NVIDIA in the last 7 days? Include sources.",
    "What is the latest stable release of vLLM and its release date? Cite sources.",
    "What is the most recent U.S. CPI (inflation) release and its month-over-month value? Cite sources."
]


def normalize_result(result):
    """Coerce a single search result into a plain mapping.

    The first matching rule wins:

    * a ``dict`` is returned as-is (same object, not a copy);
    * an object exposing ``model_dump`` returns whatever that call produces;
    * an object with a ``__dict__`` returns that attribute dictionary;
    * anything else is wrapped as ``{"value": str(result)}``.
    """
    if isinstance(result, dict):
        normalized = result
    elif hasattr(result, "model_dump"):
        normalized = result.model_dump()
    elif hasattr(result, "__dict__"):
        normalized = result.__dict__
    else:
        normalized = {"value": str(result)}
    return normalized


def print_search_results(item):
    """Print the search hits attached to a response output item, if any.

    Nothing is printed when ``item`` has no ``results`` attribute or when that
    attribute is ``None``. Otherwise a ``Results:`` header is written, followed
    by one numbered entry per hit: the title always, and the URL and snippet
    only when they are present and non-empty.
    """
    hits = getattr(item, "results", None)
    if hits is not None:
        print("    Results:")
        for position, hit in enumerate(hits, start=1):
            fields = normalize_result(hit)
            heading = fields.get("title")
            link = fields.get("url")
            # Use "content" when "snippet" is missing or empty.
            excerpt = fields.get("snippet") or fields.get("content")
            print(f"      {position}. {heading}")
            if link:
                print(f"         URL: {link}")
            if excerpt:
                print(f"         Snippet: {excerpt}")


def run_prompt(prompt):
    """Send one prompt to the Responses API with web search enabled and report on it.

    Prints, in order: the prompt, the response id and status, one line per
    output item (plus its query and search results when present), a
    diagnostics summary with limit warnings, and the response text under a
    banner naming the model, vLLM version and run date.

    Args:
        prompt: Text passed as ``input`` to ``client.responses.create``.

    Returns:
        The response object returned by ``client.responses.create``.
    """
    print(f"\nUser: {prompt}")
    request = {
        "model": MODEL_ID,
        "tools": [{"type": "web_search"}],
        "input": prompt,
        "max_tool_calls": MAX_TOOL_CALLS,
        "max_infer_iters": MAX_INFER_ITERS,
    }
    response = client.responses.create(**request)

    print(f"Response ID: {response.id}")
    print(f"Response status: {response.status}")

    # Walk the output once, echoing every item and tallying searches vs. messages.
    searches = 0
    messages = 0
    for entry in response.output:
        kind = getattr(entry, "type", "unknown")
        state = getattr(entry, "status", "unknown")
        print(f"  - Type: {kind}, Status: {state}")
        if "search" in kind:
            searches += 1
        elif kind == "message":
            messages += 1
        search_query = getattr(entry, "query", None)
        if search_query:
            print(f"    Query: {search_query}")
        print_search_results(entry)

    iterations = searches + messages

    print("\nDiagnostics:")
    # A search was triggered exactly when at least one search item was counted.
    print(f"  Search triggered: {searches > 0}")
    print(f"  Tool calls: {searches}")
    print(f"  Messages: {messages}")
    print(f"  Total iterations: {iterations}")

    # Only an incomplete response can have been cut short by one of the limits.
    if response.status == "incomplete":
        if searches >= MAX_TOOL_CALLS and messages == 0:
            print(f"  ⚠️  WARNING: Likely hit max_tool_calls limit ({MAX_TOOL_CALLS})")
            print(f"     Model used {searches} tool calls, leaving no iterations for final message")
        elif iterations >= MAX_INFER_ITERS:
            print(f"  ⚠️  WARNING: Likely hit max_infer_iters limit ({MAX_INFER_ITERS})")
            print(f"     Total iterations: {iterations}")

    rule = "=" * 60
    print("\n" + rule)
    print(f"Response from {MODEL_LABEL} with vLLM {VLLM_VERSION} ({RUN_DATE})")
    print(rule)
    print(response.output_text or "(empty response)")
    return response


# Run every prompt in order. The pause is taken only *between* prompts, so the
# last prompt is not followed by an idle 10-second wait. No try/finally wrapper
# is needed: the llama-stack client is never explicitly closed here, and any
# exception from run_prompt still propagates to the caller.
for index, prompt in enumerate(PROMPTS):
    if index:
        sleep(10)  # Brief pause between prompts
    run_prompt(prompt)



User: What were the top 3 headlines about NVIDIA in the last 7 days? Include sources.


INFO:httpx:HTTP Request: POST http://127.0.0.1:8321/v1/responses "HTTP/1.1 200 OK"


Response ID: resp_ab3e44ed-5626-45dd-813f-2e99cb028713
Response status: completed
  - Type: web_search_call, Status: completed
  - Type: web_search_call, Status: completed
  - Type: web_search_call, Status: completed
  - Type: web_search_call, Status: completed
  - Type: web_search_call, Status: completed
  - Type: message, Status: completed

Diagnostics:
  Search triggered: True
  Tool calls: 5
  Messages: 1
  Total iterations: 6

Response from GPT-OSS-20b with vLLM 0.11.2+rhai5 (2026-02-02)
(empty response)



User: What is the latest stable release of vLLM and its release date? Cite sources.


INFO:httpx:HTTP Request: POST http://127.0.0.1:8321/v1/responses "HTTP/1.1 200 OK"


Response ID: resp_dbf3f093-bd33-47c8-8b2e-0d4c00744204
Response status: completed
  - Type: web_search_call, Status: completed
  - Type: web_search_call, Status: failed
  - Type: web_search_call, Status: failed
  - Type: message, Status: completed

Diagnostics:
  Search triggered: True
  Tool calls: 3
  Messages: 1
  Total iterations: 4

Response from GPT-OSS-20b with vLLM 0.11.2+rhai5 (2026-02-02)
(empty response)



User: What is the most recent U.S. CPI (inflation) release and its month-over-month value? Cite sources.


INFO:httpx:HTTP Request: POST http://127.0.0.1:8321/v1/responses "HTTP/1.1 200 OK"


Response ID: resp_481eb5c7-84e0-4f36-8115-f035e59dd2cc
Response status: completed
  - Type: web_search_call, Status: completed
  - Type: message, Status: completed

Diagnostics:
  Search triggered: True
  Tool calls: 1
  Messages: 1
  Total iterations: 2

Response from GPT-OSS-20b with vLLM 0.11.2+rhai5 (2026-02-02)
(empty response)


# Run Output (2026-01-31)

## Prompt 1

**Timestamp:** 2026-01-31 14:57:07

**Prompt:** What were the top 3 headlines about NVIDIA in the last 7 days? Include sources.

**Search triggered:** True

**Search calls completed:** 4

**Search calls failed:** 0

**Response status:** completed

**Tool calls:** 4

**Messages:** 1

**Response ID:** resp_b528f809-e273-4e9d-9b6d-2d9b3b9a794e

**Response text:**

**Top 3 NVIDIA headlines (Jan 24‑31 2026)**  

| Date | Headline | Source |
|------|----------|--------|
| **January 31 2026** | “Nvidia CEO Huang denies he is unhappy with OpenAI, says ‘huge’ investment planned” | Reuters – [link](https://www.reuters.com/world/asia-pacific/nvidia-ceo-huang-denies-he-is-unhappy-with-openai-says-huge-investment-planned-2026-01-31/) |
| **January 31 2026** | “Nvidia’s plan to invest up to $100 billion in OpenAI has stalled, WSJ reports” | Reuters – [link](https://www.reuters.com/business/nvidias-plan-invest-up-100-billion-openai-has-stalled-wsj-reports-2026-01-31/) |
| **January 28 2026** | “Exclusive: China gives nod to ByteDance, Alibaba and Tencent to buy Nvidia’s H200 chips – sources” | Reuters – [link](https://www.reuters.com/world/china/china-gives-green-light-importing-first-batch-nvidias-h200-ai-chips-sources-say-2026-01-28/) |

These three stories were among the most widely cited and discussed in the news cycle during the past week, covering Nvidia’s strategic partnership talks with OpenAI, the status of the planned $100 billion investment, and a key regulatory milestone in China granting import permission for Nvidia’s H200 AI chips.

---

## Prompt 2

**Timestamp:** 2026-01-31 14:57:07

**Prompt:** What is the latest stable release of vLLM and its release date? Cite sources.

**Search triggered:** True

**Search calls completed:** 2

**Search calls failed:** 1

**Response status:** completed

**Tool calls:** 3

**Messages:** 1

**Response ID:** resp_05d63d0b-e892-4a7c-93a9-07d4d573b936

**Response text:**

(empty response)

---

## Prompt 3

**Timestamp:** 2026-01-31 14:57:07

**Prompt:** What is the most recent U.S. CPI (inflation) release and its month-over-month value? Cite sources.

**Search triggered:** True

**Search calls completed:** 5

**Search calls failed:** 2

**Response status:** completed

**Tool calls:** 7

**Messages:** 1

**Response ID:** resp_5f876832-eda3-4c61-8b93-650bc6253041

**Response text:**

(empty response)

---


## Full Response JSON (with metadata)

### Prompt 1

**Timestamp:** 2026-01-31 14:57:07

```json
{
  "id": "resp_b528f809-e273-4e9d-9b6d-2d9b3b9a794e",
  "created_at": 1769889298,
  "model": "vllm/openai/gpt-oss-20b",
  "output": [
    {
      "id": "fc_ceb3fa27-2249-4ef7-9f27-f835ec5a6f30",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_23193bdd-ddcf-43f6-b187-8c389e7dca6d",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_45bf2237-2245-4b5a-bf2d-5288a34b6d1d",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_7201029b-f2d8-4b45-8f0d-21d60a72ead2",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "content": [
        {
          "text": "**Top 3 NVIDIA headlines (Jan\u202f24\u201131\u202f2026)**  \n\n| Date | Headline | Source |\n|------|----------|--------|\n| **January\u202f31\u202f2026** | \u201cNvidia CEO Huang denies he is unhappy with OpenAI, says \u2018huge\u2019 investment planned\u201d | Reuters \u2013 [link](https://www.reuters.com/world/asia-pacific/nvidia-ceo-huang-denies-he-is-unhappy-with-openai-says-huge-investment-planned-2026-01-31/) |\n| **January\u202f31\u202f2026** | \u201cNvidia\u2019s plan to invest up to $100\u202fbillion in OpenAI has stalled, WSJ reports\u201d | Reuters \u2013 [link](https://www.reuters.com/business/nvidias-plan-invest-up-100-billion-openai-has-stalled-wsj-reports-2026-01-31/) |\n| **January\u202f28\u202f2026** | \u201cExclusive: China gives nod to ByteDance, Alibaba and Tencent to buy Nvidia\u2019s H200 chips \u2013 sources\u201d | Reuters \u2013 [link](https://www.reuters.com/world/china/china-gives-green-light-importing-first-batch-nvidias-h200-ai-chips-sources-say-2026-01-28/) |\n\nThese three stories were among the most widely cited and discussed in the news cycle during the past week, covering Nvidia\u2019s strategic partnership talks with OpenAI, the status of the planned $100\u202fbillion investment, and a key regulatory milestone in China granting import permission for Nvidia\u2019s H200 AI chips.",
          "annotations": [],
          "logprobs": null,
          "type": "output_text"
        }
      ],
      "role": "assistant",
      "id": "msg_f35504cd-6b4f-499d-a10c-c439df7e505f",
      "status": "completed",
      "type": "message"
    }
  ],
  "status": "completed",
  "error": null,
  "instructions": null,
  "max_tool_calls": 20,
  "metadata": null,
  "object": "response",
  "parallel_tool_calls": true,
  "previous_response_id": null,
  "prompt": null,
  "temperature": null,
  "text": {
    "format": {
      "description": null,
      "name": null,
      "schema_": null,
      "strict": null,
      "type": "text"
    }
  },
  "tool_choice": null,
  "tools": {
    "value": "[ToolOpenAIResponseInputToolWebSearch(search_context_size='medium', type='web_search')]"
  },
  "top_p": null,
  "truncation": null,
  "usage": {
    "input_tokens": 8245,
    "output_tokens": 1052,
    "total_tokens": 9297,
    "input_tokens_details": {
      "cached_tokens": 0
    },
    "output_tokens_details": {
      "reasoning_tokens": 0
    }
  }
}
```

### Prompt 2

**Timestamp:** 2026-01-31 14:57:07

```json
{
  "id": "resp_05d63d0b-e892-4a7c-93a9-07d4d573b936",
  "created_at": 1769889332,
  "model": "vllm/openai/gpt-oss-20b",
  "output": [
    {
      "id": "fc_c1169479-3903-4535-812b-fc09aec99dc8",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_66472e5c-5b82-417f-adac-16ef98dcef2b",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_300627c4-257e-42c9-96b0-4918b0aef0ad",
      "status": "failed",
      "type": "web_search_call"
    },
    {
      "content": [
        {
          "text": "",
          "annotations": [],
          "logprobs": null,
          "type": "output_text"
        }
      ],
      "role": "assistant",
      "id": "msg_f0701c28-8bd0-4b02-ac13-b67a657f65c4",
      "status": "completed",
      "type": "message"
    }
  ],
  "status": "completed",
  "error": null,
  "instructions": null,
  "max_tool_calls": 20,
  "metadata": null,
  "object": "response",
  "parallel_tool_calls": true,
  "previous_response_id": null,
  "prompt": null,
  "temperature": null,
  "text": {
    "format": {
      "description": null,
      "name": null,
      "schema_": null,
      "strict": null,
      "type": "text"
    }
  },
  "tool_choice": null,
  "tools": {
    "value": "[ToolOpenAIResponseInputToolWebSearch(search_context_size='medium', type='web_search')]"
  },
  "top_p": null,
  "truncation": null,
  "usage": {
    "input_tokens": 3004,
    "output_tokens": 233,
    "total_tokens": 3237,
    "input_tokens_details": {
      "cached_tokens": 0
    },
    "output_tokens_details": {
      "reasoning_tokens": 0
    }
  }
}
```

### Prompt 3

**Timestamp:** 2026-01-31 14:57:07

```json
{
  "id": "resp_5f876832-eda3-4c61-8b93-650bc6253041",
  "created_at": 1769889349,
  "model": "vllm/openai/gpt-oss-20b",
  "output": [
    {
      "id": "fc_54db00d4-4a33-421c-a12a-01ba0529325b",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_8d07abdd-0322-46b5-85ea-a155cedc734d",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_128946d7-12ad-469f-b0dc-f4a0338b0d88",
      "status": "failed",
      "type": "web_search_call"
    },
    {
      "id": "fc_9aff7a5c-d379-4f44-b825-182aeeb741e5",
      "status": "failed",
      "type": "web_search_call"
    },
    {
      "id": "fc_63017ee9-a978-48fc-8913-fc283b145bf4",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_4bd43fd5-781d-423d-95fc-c631830e7546",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "id": "fc_05289039-3bdb-4495-804b-858336c58e45",
      "status": "completed",
      "type": "web_search_call"
    },
    {
      "content": [
        {
          "text": "",
          "annotations": [],
          "logprobs": null,
          "type": "output_text"
        }
      ],
      "role": "assistant",
      "id": "msg_24769db1-3dca-4955-a55f-04095a209bf8",
      "status": "completed",
      "type": "message"
    }
  ],
  "status": "completed",
  "error": null,
  "instructions": null,
  "max_tool_calls": 20,
  "metadata": null,
  "object": "response",
  "parallel_tool_calls": true,
  "previous_response_id": null,
  "prompt": null,
  "temperature": null,
  "text": {
    "format": {
      "description": null,
      "name": null,
      "schema_": null,
      "strict": null,
      "type": "text"
    }
  },
  "tool_choice": null,
  "tools": {
    "value": "[ToolOpenAIResponseInputToolWebSearch(search_context_size='medium', type='web_search')]"
  },
  "top_p": null,
  "truncation": null,
  "usage": {
    "input_tokens": 9049,
    "output_tokens": 1136,
    "total_tokens": 10185,
    "input_tokens_details": {
      "cached_tokens": 0
    },
    "output_tokens_details": {
      "reasoning_tokens": 0
    }
  }
}
```
