In [1]:
from src.agi.config import OPENROUTER_API_KEY, OPENROUTER_BASE_URL, OR_MODEL, DEFAULT_MAX_PAGES_FETCHED
from src.agi.clients.openrouter_client import OpenRouterClient
from src.agi.clients.brave_client import BraveClient
from src.agi.clients.fetch_client import FetchClient
from src.agi.cache.cache import Cache
from src.agi.extract.html_extract import extract_text
from src.agi.clients.browser_client import PlaywrightFetchClient

# Final output construction
import json
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup


In [2]:
# Shared service objects used by the cells below: chat client (client.chat),
# Brave search client (brave_client.search), plain HTTP fetcher and the
# search/fetch cache.
client = OpenRouterClient()
brave_client = BraveClient()
fetch_client = FetchClient()
# Playwright-backed fetcher; in the visible part of the notebook its only use
# (seed.fetch) is commented out in the Phase 1 cell.
seed = PlaywrightFetchClient(headless=False, user_data_dir=".pw_sova")
cache = Cache()

In [15]:

# Tool schemas passed to client.chat(..., tools=TOOLS). Two function tools are
# declared: search_web(query, count) and fetch_url(url); only `query` / `url`
# are required.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": "Search the web for information. Use this first to find relevant URLs. For finding products on a specific site, use 'site:domain.com keywords' format (e.g., 'site:rozetka.ua iPhone 15').",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query string. Use 'site:domain.com keywords' to search within a specific website.",
                    },
                    "count": {
                        "type": "integer",
                        "description": "Number of results to return (default: 5, max: 10)",
                        "default": 5,
                        "minimum": 1,
                        "maximum": 10,
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Fetch and extract text content from a URL. Use this after search_web to get page content. Use this to verify product pages.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "URL to fetch",
                    },
                },
                "required": ["url"],
            },
        },
    },
]


In [16]:
# Decoupled tool execution functions

def execute_search_web(query: str, count: int = 5):
    """Run the search_web tool and format the hits for the LLM.

    A truthy cache entry for ``query`` is reused; otherwise the Brave client
    is queried with ``count`` and the response is stored in the cache.

    Returns:
        dict with "content" (human-readable text) and "success" (bool).
        Failures (exception during search/caching, or no results) are
        reported through "success": False rather than raised.
    """
    hits = cache.get_search(query)
    if not hits:
        # Cache miss (or empty cached value): go to the search backend.
        try:
            hits = brave_client.search(query, count=count)
            cache.set_search(query, hits)
        except Exception as e:
            print(f"Search failed: {e}")
            return {"content": f"Search failed: {e}", "success": False}

    if not hits:
        return {
            "content": "Search returned no results. Try a different query.",
            "success": False,
        }

    # One bullet per hit: title, URL line, snippet line.
    entries = []
    for hit in hits:
        entries.append(f"- {hit['title']}\n  URL: {hit['url']}\n  {hit['snippet']}")
    listing = "\n".join(entries)

    return {
        "content": f"Found {len(hits)} search results:\n\n{listing}",
        "success": True,
    }


def execute_fetch_url(url: str, fetched_urls: set, max_pages_fetched: int):
    """Execute the fetch_url tool: cached HTTP fetch followed by a status check.

    Args:
        url: Address to fetch. A falsy value yields a failure result.
        fetched_urls: Set of final URLs fetched successfully so far. It is
            mutated in place on success, and its size is compared against
            ``max_pages_fetched`` before any fetch is attempted.
        max_pages_fetched: Page budget; once ``len(fetched_urls)`` reaches it,
            the call fails without fetching.

    Returns:
        dict that always has "content" and "success". Status failures add
        "url" and "status" (plus "error" for non-404 statuses). Success adds
        "url", "final_url", "canonical_url", "title" and "extracted_links".
    """
    if not url:
        return {"content": "No URL provided", "success": False}

    if len(fetched_urls) >= max_pages_fetched:
        return {
            "content": f"Maximum pages fetched ({max_pages_fetched}). Cannot fetch more.",
            "success": False,
        }

    # Check cache
    cached = cache.get_fetch(url)
    if cached:
        fetch_result = cached
    else:
        fetch_result = fetch_client.fetch(url)
        if fetch_result.get("html"):
            # Extract text once and store it with the cached entry
            fetch_result["text"] = extract_text(fetch_result["html"])
            cache.set_fetch(url, fetch_result)

    status = fetch_result.get("status", 0)
    final_url = fetch_result.get("final_url", url)

    # Basic status check - only fail on 404 or other errors
    if status == 404:
        return {
            "content": f"Fetched {url} returned 404 (not found).",
            "success": False,
            "url": final_url,
            "status": status,
        }

    if status != 200:
        error_msg = fetch_result.get("error", "Unknown error")
        return {
            "content": f"Fetched {url} returned status {status} (error: {error_msg}).",
            "success": False,
            "url": final_url,
            "status": status,
            "error": error_msg,
        }

    # Success - page fetched
    fetched_urls.add(final_url)

    # Only extract when no text was stored. Passing extract_text(...) as the
    # .get() default would run the extraction on every call, even when the
    # stored text is about to be used.
    if "text" in fetch_result:
        text = fetch_result["text"]
    else:
        text = extract_text(fetch_result.get("html", ""))

    title = fetch_result.get("title", "")
    extracted_links = fetch_result.get("extracted_links", [])
    canonical_url = fetch_result.get("canonical_url")

    # Format content
    content = f"Fetched {url}:\n\n{text}"
    if extracted_links:
        content += f"\n\nFound {len(extracted_links)} links on this page:\n"
        for i, link in enumerate(extracted_links[:10], 1):  # Show first 10
            content += f"{i}. {link}\n"
        if len(extracted_links) > 10:
            content += f"... and {len(extracted_links) - 10} more\n"

    return {
        "content": content,
        "success": True,
        "url": final_url,
        "final_url": final_url,
        "canonical_url": canonical_url,
        "title": title,
        "extracted_links": extracted_links,
    }

### Phase 1 product research

In [3]:
# Reference product page used for Phase 1.
# NOTE(review): later tool-loop cells also assign to `url`, so this value does
# not survive once those cells have run.
url = 'https://sovajewels.com/catalog/koltsa/koltso-iz-belogo-zolota-i-keramiki-smart-beautiful-artikul-110474820202.html'


In [5]:
# Cannot be used in Jupyter due to Playwright's async API
# res = seed.fetch(url)

# Load the already rendered .html file instead:
with open('debug_sova_seed.html', 'r', encoding='utf-8') as f:
    data = f.read()

In [6]:

def strip_html_basic(html: str) -> str:
    """
    Minimal HTML cleaner.

    - removes script/style/noscript/svg/iframe/canvas elements
    - removes HTML comments
    - returns the cleaned document serialized back to an HTML string

    Args:
        html: Raw HTML markup. A falsy value returns "".

    Returns:
        Cleaned HTML as produced by ``str(soup)`` with the "lxml" parser.
    """
    # Local import: the notebook's import cell only brings in BeautifulSoup.
    from bs4 import Comment

    if not html:
        return ""

    soup = BeautifulSoup(html, "lxml")

    # remove heavy / irrelevant tags
    for tag in soup(["script", "style", "noscript", "svg", "iframe", "canvas"]):
        tag.decompose()

    # Remove HTML comments. `soup.comment` is attribute-style tag lookup
    # (i.e. find("comment")), not the Comment class, so testing against
    # type(soup.comment) never matches a comment node. Compare with
    # bs4.Comment instead.
    for c in soup.find_all(string=lambda s: isinstance(s, Comment)):
        c.extract()

    return str(soup)


def html_to_text_basic(html: str, max_chars: int = 12000) -> str:
    """
    Flatten an HTML document into plain text.

    The markup is first cleaned with strip_html_basic(), then reduced to its
    text nodes (one per line), blank lines are dropped, and the result is cut
    to at most ``max_chars`` characters. Returns "" when cleaning yields
    nothing.
    """
    stripped_markup = strip_html_basic(html)
    if not stripped_markup:
        return ""

    flattened = BeautifulSoup(stripped_markup, "lxml").get_text(separator="\n", strip=True)

    # Keep only non-empty lines, each trimmed of surrounding whitespace.
    kept_lines = []
    for raw_line in flattened.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)

    return "\n".join(kept_lines)[:max_chars]



In [7]:
# Two views of the saved page: cleaned HTML, and flattened plain text
# (cut at html_to_text_basic's default of 12000 characters).
databasic = strip_html_basic(data)
textbasic = html_to_text_basic(data)

In [8]:
# Phase 1: send the flattened page text to the LLM with a system prompt that
# asks for a single JSON object (title, price, currency, characteristics).
user_prompt = textbasic

messages = [
    {
        "role": "system",
        "content": (
            "You are a product information extractor.\n"
            "You receive plain text extracted from an HTML product page.\n"
            "Your task is to identify ONE main product and extract its key information.\n\n"

            "OUTPUT RULES:\n"
            "- Return ONLY valid JSON.\n"
            "- Do NOT add explanations.\n"
            "- Do NOT add markdown.\n"
            "- Do NOT invent information.\n"
            "- If some field is not present, return an empty string or null.\n\n"

            "REQUIRED JSON FORMAT:\n"
            "{\n"
            '  \"title\": \"\",\n'
            '  \"price\": null,\n'
            '  \"currency\": \"\",\n'
            '  \"characteristics\": [\n'
            '    \"\",\n'
            '    \"\",\n'
            '    \"\"\n'
            "  ]\n"
            "}\n\n"

            "FIELD DEFINITIONS:\n"
            "- title: main product name as shown in the text\n"
            "- price: numeric value of the main product price (no currency symbols)\n"
            "- currency: currency code or symbol (e.g. UAH, USD, EUR, ₴, $, €)\n"
            "- characteristics: 3 to 6 most important technical or descriptive attributes "
            "(material, size, model, color, features, compatibility, etc.)\n"
        ),
    },
    {"role": "user", "content": user_prompt},
]

# No tools are passed here; the reply is read later via out["content"].
out = client.chat(messages)

### Phase 2 product research

In [13]:
import json, re

def parse_llm_json(content: str) -> dict:
    """Parse the JSON object contained in an LLM reply.

    Accepted shapes, tried in this order:
      1. a fenced block, with or without the ``json`` language tag;
      2. the whole reply being JSON;
      3. the first decodable JSON object embedded in surrounding prose.

    Args:
        content: Raw assistant text. ``None`` is treated as an empty reply.

    Returns:
        The decoded JSON value (a dict for the object shapes above).

    Raises:
        json.JSONDecodeError: if no JSON can be decoded from the reply.
    """
    content = content or ""

    m = re.search(r"```(?:json)?\s*({.*?})\s*```", content, re.DOTALL | re.IGNORECASE)
    if m:
        return json.loads(m.group(1))

    try:
        # Common case: the reply is nothing but JSON
        return json.loads(content)
    except json.JSONDecodeError as whole_error:
        # Fallback: the model wrapped the object in prose. Try to decode an
        # object starting at each "{" and return the first that parses.
        decoder = json.JSONDecoder()
        for brace in re.finditer(r"\{", content):
            try:
                obj, _end = decoder.raw_decode(content, brace.start())
            except json.JSONDecodeError:
                continue
            return obj
        # Nothing decodable: surface the error for the whole reply
        raise whole_error

# Decode the extractor reply into the reference-product dict used by Phase 2.
# NOTE(review): assumes `out` still holds the Phase 1 reply; `out` is
# reassigned by later cells.
ref_json = parse_llm_json(out["content"])



In [14]:
# Phase 2: build the discovery prompt. The user message is a JSON document
# carrying the target domain and the Phase 1 reference dict; the system prompt
# restricts the model to search_web and asks for a JSON list of candidate URLs.
target_domain = "zolotiyvik.ua"
user_prompt = json.dumps({
    "target_domain": target_domain,
    "reference": ref_json,   # <-- your phase1 JSON dict
}, ensure_ascii=False)

messages = [
    {
        "role": "system",
        "content": (
            "You are a product search agent.\n"
            "Input: JSON with (target_domain, reference).\n"
            "Tools: search_web (Brave).\n\n"

            "TASK:\n"
            "Using ONLY search_web, find candidate URLs on target_domain that are likely to contain products similar to the reference.\n"
            "Your output for now is discovery only (no page fetching).\n\n"

            "RULES:\n"
            "- Use ONLY URLs returned by search_web.\n"
            "- Always restrict queries to target_domain using: site:TARGET_DOMAIN\n"
            "- Do NOT assume any domain-specific vocabulary.\n"
            "- Generate queries using the reference fields:\n"
            "  - title\n"
            "  - characteristics (as free-text tokens)\n"
            "  - price + currency (use as optional constraints)\n"
            "- Prefer broader queries first, then narrower.\n"
            "- Run 6 to 10 search_web calls maximum.\n"
            "- Collect and deduplicate URLs.\n"
            "- Guess url_type from snippet+path only: one of [\"listing\", \"product\", \"other\", \"unknown\"].\n\n"

            "QUERY STRATEGY:\n"
            "1) Start broad: site:TARGET_DOMAIN + short title keywords (2-4 words)\n"
            "2) Add 1-2 strongest characteristics (keep them as-is; do not normalize)\n"
            "3) If results are too broad, add price as a hint (optional)\n"
            "4) Try language variants only by removing/adding punctuation and using substrings, not translations.\n\n"

            "OUTPUT (RETURN ONLY VALID JSON):\n"
            "{\n"
            "  \"target_domain\": \"\",\n"
            "  \"queries_used\": [\"...\"],\n"
            "  \"candidates\": [\n"
            "    {\n"
            "      \"url\": \"\",\n"
            "      \"title\": \"\",\n"
            "      \"snippet\": \"\",\n"
            "      \"url_type_guess\": \"listing|product|other|unknown\",\n"
            "      \"matched_reference_bits\": [\"...\"],\n"
            "      \"evidence_query\": \"...\"\n"
            "    }\n"
            "  ]\n"
            "}\n"
        ),
    },
    {"role": "user", "content": user_prompt},
]


In [17]:
# First Phase 2 turn: the model may answer with tool calls instead of text.
# NOTE: this overwrites the Phase 1 reply previously stored in `out`.
out = client.chat(messages, tools = TOOLS)

In [18]:
# Inspect the raw assistant message returned by the call above.
out

{'role': 'assistant',
 'content': '',
 'refusal': None,
 'reasoning': 'The user wants me to search for products similar to a reference on the domain zolotiyvik.ua. The reference is a ring made of white gold and ceramic called "Smart & Beautiful" with price 6664 UAH.\n\nLet me analyze the reference:\n- Title: Кольцо из белого золота и керамики Smart & Beautiful\n- Price: 6664 UAH\n- Characteristics:\n  - Металл: Золото 585 пробы\n  - Цвет металла: Белый\n  - Материал: Керамика\n  - Средний вес: 2.2107 г\n  - Артикул: 110474820201\n\nI need to generate 6-10 search queries using site:zolotiyvik.ua with different combinations of keywords:\n\n1. Broad: site:zolotiyvik.ua кольцо золото керамика\n2. With white gold: site:zolotiyvik.ua белое золото керамика кольцо\n3. With the brand/name: site:zolotiyvik.ua Smart Beautiful кольцо\n4. With article number: site:zolotiyvik.ua 110474820201\n5. White gold ring ceramic: site:zolotiyvik.ua кольцо белое золото керамика 585\n6. With price hint: site:zo

In [None]:
# Tool calls requested by the assistant message (empty list when the key is absent).
tool_calls = out.get("tool_calls", [])
tool_calls

In [None]:
# Accumulators filled by the tool-execution cell below.
new_messages = []      # tool-role replies, one per executed tool call
url_provenance = {}    # url -> {url, title, final_url, canonical_url}
sources = []           # successfully fetched pages as {url, title}
rejected_urls = []     # failed fetches as {url, reason}

In [None]:

# Execute the tool calls requested by the Phase 2 assistant reply (`out`) and
# collect one tool-role reply per call in `new_messages`.
tool_calls = out.get("tool_calls") or []  # key may be absent or null

# This cell runs before the agent-loop cell, so it keeps its own trace list
# instead of relying on `debug_traces` / `step` defined further down.
debug_traces = []

for tool_call in tool_calls:
    function_spec = tool_call.get("function", {})
    function_name = function_spec.get("name", "")
    function_args_str = function_spec.get("arguments") or "{}"

    try:
        function_args = json.loads(function_args_str)
        if not isinstance(function_args, dict):
            raise ValueError("tool arguments must be a JSON object")
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print(f"Invalid tool call arguments: {function_args_str}")
        # Still answer the call so every tool_call_id gets a tool reply
        new_messages.append({
            "role": "tool",
            "tool_call_id": tool_call.get("id"),
            "content": f"Invalid tool call arguments: {function_args_str}",
        })
        continue

    print(f"Tool call: {function_name}({function_args})")

    # Execute tool using decoupled functions. Phase 2 is discovery only, so
    # any tool other than search_web gets an explicit failure result rather
    # than leaving `tool_result` unbound or stale from a previous iteration.
    if function_name == "search_web":
        query = function_args.get("query", "")
        count = function_args.get("count", 5)
        tool_result = execute_search_web(query, count)
    else:
        tool_result = {
            "content": f"Tool not available in this phase: {function_name}",
            "success": False,
        }

    new_messages.append({
        "role": "tool",
        "tool_call_id": tool_call.get("id"),
        "content": tool_result["content"],
    })

    # Track sources and rejected URLs (only results that carry a "url" key)
    if "url" in tool_result:
        url = tool_result.get("final_url") or tool_result.get("url")

        if tool_result.get("success"):
            # Successful fetch
            title = tool_result.get("title", url)

            # Update provenance (simplified)
            if url not in url_provenance:
                url_provenance[url] = {
                    "url": url,
                    "title": title,
                    "final_url": tool_result.get("final_url", url),
                    "canonical_url": tool_result.get("canonical_url"),
                }

            # Add to sources if not already present
            if not any(s.get("url") == url for s in sources):
                sources.append({
                    "url": url,
                    "title": title,
                })
        else:
            # Failed fetch - track as rejected
            reason = tool_result.get("content", "Unknown error")
            status = tool_result.get("status", "unknown")
            rejected_urls.append({
                "url": url,
                "reason": f"Status {status}: {reason}",
            })

    # Track debug info; this cell handles a single assistant turn, hence step 1
    debug_traces.append({
        "step": 1,
        "tool": function_name,
        "args": function_args,
        "result_length": len(tool_result.get("content", "")),
        "success": tool_result.get("success", False),
    })


In [55]:

# NOTE(review): this re-declares TOOLS with the same two tool schemas
# (search_web, fetch_url) as the earlier TOOLS cell; consider keeping a single
# definition so the two cannot drift apart.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": "Search the web for information. Use this first to find relevant URLs. For finding products on a specific site, use 'site:domain.com keywords' format (e.g., 'site:rozetka.ua iPhone 15').",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query string. Use 'site:domain.com keywords' to search within a specific website.",
                    },
                    "count": {
                        "type": "integer",
                        "description": "Number of results to return (default: 5, max: 10)",
                        "default": 5,
                        "minimum": 1,
                        "maximum": 10,
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Fetch and extract text content from a URL. Use this after search_web to get page content. Use this to verify product pages.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "URL to fetch",
                    },
                },
                "required": ["url"],
            },
        },
    },
]


In [None]:
# Agent variant of Phase 1: the user message is a free-text request containing
# the reference product URL followed by the target shop URL; the system prompt
# asks the model to fetch the reference page (falling back to search) and
# return its defining attributes as JSON.
user_prompt = "найди 5 максимально похожих товаров на этот: https://sovajewels.com/ua/p/koltso-iz-belogo-zolota-i-keramiki-smart-beautiful-artikul-110474820202/ только на сайте этой кмпании: https://zolotiyvik.ua/ua/"
messages = [
    {
        "role": "system",
        "content": (
            "You are a product research assistant.\n"
            "Tools: fetch_url (HTTP GET), search_web (Brave search).\n\n"

            "PHASE 1 ONLY: Reference product -> MAIN DEFINITIONS\n"
            "Goal: Extract MAIN DEFINITIONS (MIN 2, MAX 10) that best define the product for later matching.\n\n"

            "Algorithm:\n"
            "1) From the user message, extract reference_url.\n"
            "2) Try fetch_url(reference_url).\n"
            "3) If fetch_url fails (success=false OR status!=200), fallback to search_web:\n"
            "   - run search_web using reference_url as query\n"
            "   - also run search_web using tokens from the URL slug/title (if obvious)\n"
            "   - from search results, pick up to 2 URLs that likely describe the same product and try fetch_url on them\n"
            "4) Build product definitions from the best available source(s) in this priority order:\n"
            "   - successful fetch_url text\n"
            "   - search_web snippets\n"
            "5) MAIN DEFINITIONS must be specific/actionable attributes (avoid vague terms).\n"
            "   Examples: color, size/dimensions, capacity,price_range, material, key feature, compatibility, form-factor.\n"
            "6) If something is inferred (not explicitly stated), mark it as uncertain.\n\n"

            "GLOBAL RULES:\n"
            "- Do NOT invent URLs.\n"
            "- Only use URLs from the user input or returned by search_web.\n"
            "- Always include evidence_urls (exact URLs used).\n\n"

            "FINAL OUTPUT: Return ONLY valid JSON (no markdown, no extra text).\n"
            "{\n"
            '  "phase": "phase_1_reference_definitions",\n'
            '  "reference_url": "",\n'
            '  "fetch_attempts": [\n'
            '    {"url": "", "success": false, "status": null}\n'
            "  ],\n"
            '  "source": "fetch_url|search_web|mixed",\n'
            '  "product": {\n'
            '    "title": "",\n'
            '    "product_type": "",\n'
            '    "brand": "",\n'
            '    "model": "",\n'
            '    "main_definitions": [\n'
            '      {"name": "", "value": "", "source": "fetch|search", "uncertain": false}\n'
            "    ],\n"
            '    "keywords": []\n'
            "  },\n"
            '  "evidence_urls": []\n'
            "}\n"
        ),
    },
    {"role": "user", "content": user_prompt},
]


In [62]:
# Decoupled tool execution functions

def execute_search_web(query: str, count: int = 5):
    """Run the search_web tool and format the hits for the LLM.

    A truthy cache entry for ``query`` is reused; otherwise the Brave client
    is queried with ``count`` and the response is stored in the cache.

    Returns:
        dict with "content" (human-readable text) and "success" (bool).
        Failures (exception during search/caching, or no results) are
        reported through "success": False rather than raised.
    """
    hits = cache.get_search(query)
    if not hits:
        # Cache miss (or empty cached value): go to the search backend.
        try:
            hits = brave_client.search(query, count=count)
            cache.set_search(query, hits)
        except Exception as e:
            print(f"Search failed: {e}")
            return {"content": f"Search failed: {e}", "success": False}

    if not hits:
        return {
            "content": "Search returned no results. Try a different query.",
            "success": False,
        }

    # One bullet per hit: title, URL line, snippet line.
    entries = []
    for hit in hits:
        entries.append(f"- {hit['title']}\n  URL: {hit['url']}\n  {hit['snippet']}")
    listing = "\n".join(entries)

    return {
        "content": f"Found {len(hits)} search results:\n\n{listing}",
        "success": True,
    }


def execute_fetch_url(url: str, fetched_urls: set, max_pages_fetched: int):
    """Execute the fetch_url tool: cached HTTP fetch followed by a status check.

    Args:
        url: Address to fetch. A falsy value yields a failure result.
        fetched_urls: Set of final URLs fetched successfully so far. It is
            mutated in place on success, and its size is compared against
            ``max_pages_fetched`` before any fetch is attempted.
        max_pages_fetched: Page budget; once ``len(fetched_urls)`` reaches it,
            the call fails without fetching.

    Returns:
        dict that always has "content" and "success". Status failures add
        "url" and "status" (plus "error" for non-404 statuses). Success adds
        "url", "final_url", "canonical_url", "title" and "extracted_links".
    """
    if not url:
        return {"content": "No URL provided", "success": False}

    if len(fetched_urls) >= max_pages_fetched:
        return {
            "content": f"Maximum pages fetched ({max_pages_fetched}). Cannot fetch more.",
            "success": False,
        }

    # Check cache
    cached = cache.get_fetch(url)
    if cached:
        fetch_result = cached
    else:
        fetch_result = fetch_client.fetch(url)
        if fetch_result.get("html"):
            # Extract text once and store it with the cached entry
            fetch_result["text"] = extract_text(fetch_result["html"])
            cache.set_fetch(url, fetch_result)

    status = fetch_result.get("status", 0)
    final_url = fetch_result.get("final_url", url)

    # Basic status check - only fail on 404 or other errors
    if status == 404:
        return {
            "content": f"Fetched {url} returned 404 (not found).",
            "success": False,
            "url": final_url,
            "status": status,
        }

    if status != 200:
        error_msg = fetch_result.get("error", "Unknown error")
        return {
            "content": f"Fetched {url} returned status {status} (error: {error_msg}).",
            "success": False,
            "url": final_url,
            "status": status,
            "error": error_msg,
        }

    # Success - page fetched
    fetched_urls.add(final_url)

    # Only extract when no text was stored. Passing extract_text(...) as the
    # .get() default would run the extraction on every call, even when the
    # stored text is about to be used.
    if "text" in fetch_result:
        text = fetch_result["text"]
    else:
        text = extract_text(fetch_result.get("html", ""))

    title = fetch_result.get("title", "")
    extracted_links = fetch_result.get("extracted_links", [])
    canonical_url = fetch_result.get("canonical_url")

    # Format content
    content = f"Fetched {url}:\n\n{text}"
    if extracted_links:
        content += f"\n\nFound {len(extracted_links)} links on this page:\n"
        for i, link in enumerate(extracted_links[:10], 1):  # Show first 10
            content += f"{i}. {link}\n"
        if len(extracted_links) > 10:
            content += f"... and {len(extracted_links) - 10} more\n"

    return {
        "content": content,
        "success": True,
        "url": final_url,
        "final_url": final_url,
        "canonical_url": canonical_url,
        "title": title,
        "extracted_links": extracted_links,
    }

In [None]:
import json
import re

from typing import Any, Dict, List, Optional
def extract_json_from_text(text: str) -> str:
    """Return the JSON-object substring found in a model reply.

    Resolution order:
      1. the object inside a fenced code block (language tag optional);
      2. the first brace-balanced ``{...}`` span, ignoring braces that occur
         inside double-quoted strings;
      3. everything from the first "{" to the end when the braces never
         balance;
      4. the stripped input when it contains no "{" at all.

    A falsy input yields "".
    """
    if not text:
        return ""

    fenced = re.search(r"```(?:json)?\s*({[\s\S]*?})\s*```", text, re.IGNORECASE)
    if fenced is not None:
        return fenced.group(1).strip()

    opening = text.find("{")
    if opening < 0:
        return text.strip()

    # Walk forward from the first "{" tracking nesting depth and string state.
    nesting = 0
    inside_string = False
    escaped = False
    for pos, char in enumerate(text[opening:], start=opening):
        if inside_string:
            if escaped:
                # Character after a backslash is consumed verbatim
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                inside_string = False
            continue

        if char == '"':
            inside_string = True
        elif char == "{":
            nesting += 1
        elif char == "}":
            nesting -= 1
            if nesting == 0:
                return text[opening:pos + 1].strip()

    # Unbalanced object: hand back the tail and let the caller's parser complain
    return text[opening:].strip()


def _extract_first_url(user_prompt: str) -> Optional[str]:
    m = re.search(r"https?://[^\s]+", user_prompt or "")
    return m.group(0) if m else None


def construct_final_output(
    answer: str,
    user_prompt: str,
    sources: List[Dict[str, Any]],
    rejected_urls: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Assemble the Phase-1 result document (phase_1_reference_definitions).

    The model's answer is decoded as a JSON object and merged field by field
    over a skeleton of defaults; fields of the wrong type are ignored. URLs
    of successfully used sources are always appended to ``evidence_urls``.

    Args:
        answer: Final assistant text, expected to contain a JSON object.
        user_prompt: Original user message; its first URL is the fallback
            ``reference_url``.
        sources: Dicts with a "url" key collected during tool execution.
        rejected_urls: Failed-fetch records, passed through as "rejected".

    Returns:
        dict with keys phase, reference_url, fetch_attempts, source, product,
        evidence_urls and rejected.
    """
    # Decode the model answer; anything other than a JSON object counts as absent.
    model_json: Optional[Dict[str, Any]] = None
    try:
        raw_json = extract_json_from_text(answer)
        if raw_json:
            decoded = json.loads(raw_json)
            if isinstance(decoded, dict):
                model_json = decoded
    except Exception as e:
        print(f"Could not parse LLM output as JSON: {e}")
        model_json = None

    prompt_url = _extract_first_url(user_prompt)

    # Non-blank string URLs taken from the tool-call sources, in order.
    tool_urls: List[str] = []
    for entry in sources or []:
        candidate = entry.get("url")
        if candidate and isinstance(candidate, str) and candidate.strip():
            tool_urls.append(candidate)

    result: Dict[str, Any] = {
        "phase": "phase_1_reference_definitions",
        "reference_url": prompt_url or "",
        "fetch_attempts": [],
        "source": "mixed" if tool_urls else "search_web",
        "product": {
            "title": "",
            "product_type": "",
            "brand": "",
            "model": "",
            "main_definitions": [],
            "keywords": [],
        },
        "evidence_urls": [],
        # Optional passthrough for the pipeline (even if unused now)
        "rejected": rejected_urls or [],
    }

    if not model_json:
        # No usable model JSON: defaults plus the de-duplicated tool evidence.
        result["evidence_urls"] = list(dict.fromkeys(tool_urls))
        return result

    # Scalar string fields: accept only non-blank strings, stored stripped.
    for field in ("phase", "reference_url", "source"):
        value = model_json.get(field)
        if isinstance(value, str) and value.strip():
            result[field] = value.strip()

    attempts = model_json.get("fetch_attempts")
    if isinstance(attempts, list):
        result["fetch_attempts"] = attempts

    model_product = model_json.get("product")
    if isinstance(model_product, dict):
        product = result["product"]
        for field in ("title", "product_type", "brand", "model"):
            if isinstance(model_product.get(field), str):
                product[field] = model_product[field]
        for field in ("keywords", "main_definitions"):
            if isinstance(model_product.get(field), list):
                product[field] = model_product[field]

    # Evidence: model-declared URLs first, then tool URLs; order-preserving dedup.
    claimed = model_json.get("evidence_urls")
    if not isinstance(claimed, list):
        claimed = []
    claimed_strings = [u for u in claimed if isinstance(u, str)]
    result["evidence_urls"] = list(dict.fromkeys(claimed_strings + tool_urls))

    # Locally tracked rejections win over whatever the model reported.
    result["rejected"] = rejected_urls or model_json.get("rejected", []) or []

    if not result.get("reference_url") and prompt_url:
        result["reference_url"] = prompt_url

    return result


In [68]:
# State for the agent loop in the next cell.
max_steps = 10          # upper bound on LLM turns
fetched_urls = set()    # final URLs fetched successfully (enforces the page budget)
sources = []            # successfully fetched pages as {url, title}
debug_traces = []       # one record per executed tool call
url_provenance = {}     # url -> {url, title, final_url, canonical_url}
rejected_urls = []  # Track URLs that failed (404, errors, etc.)

In [None]:
# Agent loop: alternate LLM turns and tool executions until the model answers
# without tool calls, the API call fails, or max_steps is exhausted.
answer = ""  # Final assistant text; stays empty if the loop ends without one

for step in range(max_steps):
    print(f"Agent step {step + 1}/{max_steps}")

    # Call LLM
    try:
        assistant_message = client.chat(messages, tools=TOOLS)
    except Exception as e:
        print(f"OpenRouter call failed: {e}")
        break

    # No tool calls (key absent, null or empty) means this is the final answer
    tool_calls = assistant_message.get("tool_calls") or []
    if not tool_calls:
        # content may be null in the API response; normalise to a string
        answer = assistant_message.get("content") or ""
        print(f"Agent completed with final answer ({len(answer)} chars)")
        break

    # Process tool calls
    messages.append(assistant_message)

    for tool_call in tool_calls:
        function_spec = tool_call.get("function", {})
        function_name = function_spec.get("name", "")
        function_args_str = function_spec.get("arguments") or "{}"

        try:
            function_args = json.loads(function_args_str)
            if not isinstance(function_args, dict):
                raise ValueError("tool arguments must be a JSON object")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print(f"Invalid tool call arguments: {function_args_str}")
            # Every tool_call_id in the assistant message needs a tool reply,
            # otherwise the next chat request is malformed. Report the problem
            # back to the model instead of silently skipping the call.
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.get("id"),
                "content": f"Invalid tool call arguments: {function_args_str}",
            })
            continue

        print(f"Tool call: {function_name}({function_args})")

        # Execute tool using decoupled functions
        if function_name == "search_web":
            query = function_args.get("query", "")
            count = function_args.get("count", 5)
            tool_result = execute_search_web(query, count)
        elif function_name == "fetch_url":
            url = function_args.get("url", "")
            tool_result = execute_fetch_url(url, fetched_urls, DEFAULT_MAX_PAGES_FETCHED)
        else:
            tool_result = {
                "content": f"Unknown tool: {function_name}",
                "success": False,
            }

        # Add tool result to messages
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.get("id"),
            "content": tool_result["content"],
        })

        # Track sources and rejected URLs (only fetch results carry "url")
        if "url" in tool_result:
            url = tool_result.get("final_url") or tool_result.get("url")

            if tool_result.get("success"):
                # Successful fetch
                title = tool_result.get("title", url)

                # Update provenance (simplified)
                if url not in url_provenance:
                    url_provenance[url] = {
                        "url": url,
                        "title": title,
                        "final_url": tool_result.get("final_url", url),
                        "canonical_url": tool_result.get("canonical_url"),
                    }

                # Add to sources if not already present
                if not any(s.get("url") == url for s in sources):
                    sources.append({
                        "url": url,
                        "title": title,
                    })
            else:
                # Failed fetch - track as rejected
                reason = tool_result.get("content", "Unknown error")
                status = tool_result.get("status", "unknown")
                rejected_urls.append({
                    "url": url,
                    "reason": f"Status {status}: {reason}",
                })

        # Track debug info
        debug_traces.append({
            "step": step + 1,
            "tool": function_name,
            "args": function_args,
            "result_length": len(tool_result.get("content", "")),
            "success": tool_result.get("success", False),
        })
else:
    # Loop ran out of steps without a break: no final answer was produced
    print(f"Agent stopped after {max_steps} steps without a final answer")



# Build the Phase-1 result document from the agent's answer and tracked URLs
final_output = construct_final_output(answer, user_prompt, sources, rejected_urls)

# Dump it as indented JSON framed by 80-character rules
print(
    "\n" + "="*80,
    "FINAL OUTPUT:",
    "="*80,
    json.dumps(final_output, indent=2, ensure_ascii=False),
    "="*80,
    sep="\n",
)


In [70]:
# Manual check: query Brave directly for the reference product on its own site.
query = "site:sovajewels.com кольцо белое золото керамика Smart Beautiful 110474820202"
# Result count is given explicitly (5, the search_web default). The cell used to
# read a `count` variable that only exists after the agent loop has executed a
# search_web call, so it failed on a fresh kernel.
out = brave_client.search(query, count=5)

In [71]:
# Inspect the raw search results (list of dicts with title / url / snippet).
out

[{'title': 'Кольцо из белого золота и керамики Smart & Beautiful. Артикул: 110474820202 купить в интернет-магазине l SOVA Jewels',
  'url': 'https://sovajewels.com/catalog/koltsa/koltso-iz-belogo-zolota-i-keramiki-smart-beautiful-artikul-110474820202.html',
  'snippet': 'Купить Кольцо из белого золота и керамики Smart &amp; Beautiful. Артикул: 110474820202 по лучшей цене в интернет-магазине SOVA Jewels ⚜️Собственное производство✈️ Возможность быстрой доставки за 2-6 часов'},
 {'title': 'Купить Кольцо из белого золота и керамики Smart & Beautiful. Артикул: 110474820202 Киев | SOVA Jewels',
  'url': 'https://sovajewels.com/catalog/koltsa/koltso-iz-belogo-zolota-i-keramiki-smart-beautiful-artikul-110474820202.html?OFFER_ID=28418',
  'snippet': 'Если настроение диктует надеть решительный look прямо с подиума, Кольцо из белого золота и керамики Smart &amp; Beautiful. Артикул: 110474820202 - разумное и красивое дополнение.'},
 {'title': 'Кольцо из керамики с белым золотом Smart & Beautiful. 

In [None]:
# Final output construction
import json
import re
from urllib.parse import urlparse

def extract_json_from_text(text: str):
    """Extract the text of a JSON object embedded in ``text``.

    Candidates are tried in this order:

    1. The object inside the first markdown code fence (```` ``` ```` or
       ```` ```json ````), if it is valid JSON.
    2. The first ``{...}`` span anywhere in ``text`` that decodes as a JSON
       object. Scanning brace by brace means stray braces in surrounding
       prose no longer corrupt the result. Note that for truncated output
       this may be a complete inner object.
    3. Best-effort fallbacks for text that contains no valid object: the
       fenced candidate, then the span from the first ``{`` to the last
       ``}``, then ``text`` itself.

    Args:
        text: Raw text, typically an LLM answer. ``None`` is treated as empty.

    Returns:
        The extracted JSON text as a string (not parsed). The caller is
        expected to run ``json.loads`` on it and handle decode errors.
    """
    if text is None:
        # Lets callers fall through to their JSONDecodeError handling
        # instead of hitting a TypeError inside re.search.
        return ""

    # Try to find JSON in code blocks
    fenced_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    fenced_candidate = fenced_match.group(1) if fenced_match else None
    if fenced_candidate is not None:
        try:
            json.loads(fenced_candidate)
        except ValueError:
            pass  # invalid fenced content: keep looking in the whole text
        else:
            return fenced_candidate

    # Scan for the first position where a complete JSON object starts
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find("{", start + 1)
        else:
            return text[start:end]

    # Nothing decodes: fall back to the loose candidates so the caller's
    # json.loads reports a meaningful error.
    if fenced_candidate is not None:
        return fenced_candidate

    loose_match = re.search(r'\{.*\}', text, re.DOTALL)
    if loose_match:
        return loose_match.group(0)

    return text

def construct_final_output(answer: str, user_prompt: str, sources: list, rejected_urls: list):
    """Construct the final output JSON structure.

    Args:
        answer: Raw LLM answer, expected to contain a JSON object (possibly
            wrapped in prose or a markdown code fence). ``None``, unparseable
            text, or JSON that is not an object falls back to empty defaults.
        user_prompt: Prompt text. The first URL found in it is used as the
            reference URL and the second as the target URL.
        sources: Iterable of dicts; every truthy ``"url"`` value becomes an
            evidence URL. ``None`` is treated as empty.
        rejected_urls: Stored unchanged under the ``"rejected"`` key.

    Returns:
        A dict with the keys ``phase``, ``reference``, ``target_site``,
        ``definition_groups`` and ``rejected``. Values from the parsed LLM
        answer override the defaults, except that ``evidence_urls`` always
        contains every source URL (de-duplicated, in a stable order).
    """
    # Try to parse LLM's answer as JSON
    parsed_output = None
    try:
        json_text = extract_json_from_text(answer)
        parsed_output = json.loads(json_text)
    except (json.JSONDecodeError, AttributeError, TypeError) as e:
        # TypeError covers a non-string answer (e.g. None when the agent
        # never produced a reply).
        print(f"Could not parse LLM output as JSON: {e}")
        parsed_output = None

    # Everything below uses dict access, so any other JSON value is unusable
    if parsed_output is not None and not isinstance(parsed_output, dict):
        print(
            "Could not parse LLM output as JSON: "
            f"expected an object, got {type(parsed_output).__name__}"
        )
        parsed_output = None

    # Extract URLs from user prompt (first is reference, second is target).
    # Sentence punctuation glued to the end of a URL is not part of it.
    trailing_punctuation = ".,;:!?)]}>\"'"
    all_urls = [
        url.rstrip(trailing_punctuation)
        for url in re.findall(r'https?://[^\s]+', user_prompt or "")
    ]
    reference_url = all_urls[0] if len(all_urls) > 0 else None
    target_url = all_urls[1] if len(all_urls) > 1 else None

    # Extract target domain from target URL
    target_domain = None
    if target_url:
        try:
            target_domain = urlparse(target_url).netloc or None
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal)
            target_domain = None

    # Source URLs, de-duplicated while keeping first-seen order
    source_urls = list(dict.fromkeys(s["url"] for s in (sources or []) if s.get("url")))

    # Sections of the LLM answer; anything that is not a dict is ignored
    parsed_reference = parsed_output.get("reference") if parsed_output else None
    if not isinstance(parsed_reference, dict):
        parsed_reference = {}
    parsed_target_site = parsed_output.get("target_site") if parsed_output else None
    if not isinstance(parsed_target_site, dict):
        parsed_target_site = {}

    if parsed_output:
        attributes = parsed_reference.get("attributes", {})
        uncertainty_notes = parsed_reference.get("uncertainty_notes", {})
        definition_groups = parsed_output.get("definition_groups", [])
    else:
        # No usable LLM answer: emit the empty template
        attributes = {
            "product_type": "",
            "brand": "",
            "model": "",
            "main_definitions": [],
            "keywords": [],
            "price_hint": None,
        }
        uncertainty_notes = {
            "product_type": "",
            "brand": "",
            "model": "",
            "main_definitions": "",
            "keywords": "",
            "price_hint": "",
        }
        definition_groups = []

    # Build the reference section, then let the LLM's values override it
    reference = {
        "reference_url": reference_url,
        "attributes": attributes,
        "uncertainty_notes": uncertainty_notes,
        "evidence_urls": source_urls,
    }
    reference.update(parsed_reference)
    if not reference.get("reference_url") and reference_url:
        # Same rule as for the target domain: an empty LLM value must not
        # erase what was found in the prompt.
        reference["reference_url"] = reference_url

    # Ensure evidence_urls includes all sources; only non-empty strings from
    # the LLM's own list are kept, and the order is deterministic.
    llm_evidence = parsed_reference.get("evidence_urls")
    if not isinstance(llm_evidence, (list, tuple)):
        llm_evidence = []
    extra_urls = [url for url in llm_evidence if isinstance(url, str) and url]
    reference["evidence_urls"] = list(dict.fromkeys(source_urls + extra_urls))

    target_site = {
        "domain": target_domain or "",
        "language_prefix_hint": "",
    }
    target_site.update(parsed_target_site)
    if not target_site.get("domain") and target_domain:
        target_site["domain"] = target_domain

    # Build final output structure
    final_output = {
        "phase": "phase_1_2_definitions_to_categories",
        "reference": reference,
        "target_site": target_site,
        "definition_groups": definition_groups,
        "rejected": rejected_urls,
    }

    return final_output

# Emit the final output only when the run produced an answer or at least one source
if not (answer or sources):
    print("\nNo final output - agent did not complete or produce results.")
else:
    final_output = construct_final_output(answer, user_prompt, sources, rejected_urls)

    # Indented JSON framed by 80-character rules
    print("\n" + "=" * 80, "FINAL OUTPUT:", "=" * 80, sep="\n")
    print(json.dumps(final_output, indent=2, ensure_ascii=False), "=" * 80, sep="\n")

In [None]:
# Print the "content" field of the last entry in `messages`
# (presumably the final message of the agent conversation — confirm against the loop cell)
print(messages[-1]['content'])