In [114]:
from openai import OpenAI

# Client shared by every model call in this notebook. No credentials are passed
# here, so they are presumably read from the environment — TODO confirm.
openai_client = OpenAI()

In [115]:
import requests
from urllib.parse import quote_plus

def search_wikipedia(query: str):
    """Run a full-text search against English Wikipedia.

    Sends a ``list=search`` request to the MediaWiki API and returns the list
    found under ``["query"]["search"]`` of the JSON reply, unchanged.

    Raises:
        RuntimeError: when the HTTP status is not 200 (the message carries the
            first 200 characters of the body), or when the reply lacks the
            ``query`` / ``search`` keys.
    """
    encoded_query = quote_plus(query)
    endpoint = "https://en.wikipedia.org/w/api.php"
    url = f"{endpoint}?action=query&format=json&list=search&srsearch={encoded_query}"

    response = requests.get(
        url,
        timeout=10,
        headers={"User-Agent": "my-agent/1.0 (contact: you@example.com)"},
    )

    # Surface HTTP failures together with a short excerpt of the body.
    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code}: {response.text[:200]}")

    payload = response.json()

    has_hits = "query" in payload and "search" in payload["query"]
    if not has_hits:
        raise RuntimeError(f"Unexpected response shape: {payload}")

    return payload["query"]["search"]


# Test
if __name__ == "__main__":
    # Smoke test: print the title of every hit for a sample query.
    results = search_wikipedia("capybara")
    titles = [hit["title"] for hit in results]
    print(titles)



['Capybara', 'Lesser capybara', 'Capybara (disambiguation)', 'Capybara Games', 'Hydrochoerus', 'Caviidae', 'Flow (2024 film)', 'Capybara (software)', 'Yuzu bath', 'Kerodon']


In [116]:
results = search_wikipedia("capybara")

# Keep only the hits whose title mentions "capybara", ignoring case.
capybara_title_results = list(
    filter(lambda hit: "capybara" in hit["title"].lower(), results)
)

count = len(capybara_title_results)

print("Count:", count)
print("Titles:", [hit["title"] for hit in capybara_title_results])


Count: 5
Titles: ['Capybara', 'Lesser capybara', 'Capybara (disambiguation)', 'Capybara Games', 'Capybara (software)']


In [117]:
# # import requests
# # from urllib.parse import quote_plus

# def get_wikipedia(page_title: str) -> dict:
#     """
#     Fetch raw Wikipedia page content (wikitext) via:
#     https://en.wikipedia.org/w/index.php?title=PAGE_TITLE&action=raw
#     """
#     if not isinstance(page_title, str) or not page_title.strip():
#         raise ValueError("page_title must be a non-empty string")

#     # Browser-style encoding: spaces -> '+'
#     encoded_title = quote_plus(page_title)

#     url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&action=raw"

#     r = requests.get(
#         url,
#         timeout=10,
#         headers={"User-Agent": "my-agent/1.0 (contact: you@example.com)"}
#     )

#     if r.status_code != 200:
#         raise RuntimeError(f"HTTP {r.status_code}: {r.text[:200]}")

#     # Raw endpoint returns plain text (wikitext), not JSON
#     return {
#         "title": page_title,
#         "url": url,
#         "content": r.text
#     }


# # Test
# if __name__ == "__main__":
#     doc = get_wikipedia("Capybara")
#     print("URL:", doc["url"])
#     print("First 500 chars:\n", doc["content"][:500])


In [118]:
import requests
from dataclasses import dataclass
from urllib.parse import quote_plus

@dataclass
class WikipediaPageDocument:
    """A single Wikipedia page as downloaded, plus where it came from."""

    title: str        # page title as supplied by the caller
    url: str          # URL the content was downloaded from
    raw_content: str  # response body, stored unmodified

    def parse(self) -> dict:
        """Return the page as a ``{"content": ..., "metadata": ...}`` dict.

        The text goes under ``"content"`` (per the original author's note this
        is the key name gitsource expects); ``"metadata"`` records the source
        name, the title and the URL.
        """
        metadata = {
            "source": "wikipedia",
            "title": self.title,
            "url": self.url,
        }
        return {"content": self.raw_content, "metadata": metadata}

class WikipediaPageDataReader:
    """Downloads Wikipedia pages by title through ``index.php?action=raw``."""

    def __init__(self, user_agent: str = "my-agent/1.0 (contact: you@example.com)"):
        # Sent as the User-Agent header on every request this reader makes.
        self.user_agent = user_agent

    def read(self, page_titles: list[str]) -> list[WikipediaPageDocument]:
        """Fetch each title in order and return one document per page.

        A non-success HTTP status on any page aborts the whole call with the
        exception raised by ``raise_for_status``.
        """
        return [self._fetch(title) for title in page_titles]

    def _fetch(self, title: str) -> WikipediaPageDocument:
        """Download one page and wrap the response body in a document."""
        encoded_title = quote_plus(title)
        url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&action=raw"
        response = requests.get(
            url,
            timeout=10,
            headers={"User-Agent": self.user_agent},
        )
        response.raise_for_status()
        return WikipediaPageDocument(title=title, url=url, raw_content=response.text)


In [119]:
from gitsource import chunk_documents

# Download two capybara articles, turn each into a dict, then hand them to
# chunk_documents (size=3000, step=1500 — presumably character counts, with
# consecutive chunks overlapping; confirm against gitsource).
reader = WikipediaPageDataReader()
pages = reader.read(["Capybara", "Lesser capybara"])

parsed_docs = [page.parse() for page in pages]
chunked_docs = chunk_documents(
    parsed_docs,
    size=3000,
    step=1500,
)


In [120]:
from minsearch import AppendableIndex

In [121]:
# NOTE(review): WikipediaPageDocument.parse() only produces "content" and
# "metadata" keys, so "title", "description" and "filename" look like leftovers
# from another dataset — confirm which keys chunk_documents actually emits.
index = AppendableIndex(
    keyword_fields=["filename"],
    text_fields=["title", "description", "content"],
)
index.fit(chunked_docs)

<minsearch.append.AppendableIndex at 0x773428bbef00>

In [122]:
def search(query, num_results=5):
    """Look up the entries of the module-level ``index`` that match ``query``.

    Args:
        query: free-text search string.
        num_results: maximum number of hits to request. Defaults to 5, the
            value that was previously hard-coded, so existing callers
            (including ``search(**arguments)`` from a tool call) are unaffected.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    return index.search(query=query, num_results=num_results)

In [123]:
import json

# System prompt for the plain RAG flow: answer strictly from the supplied
# CONTEXT and say so when the answer is not there.
RAG_INSTRUCTIONS = """
You're a documentation assistant. Answer the QUESTION based on the CONTEXT from our documentation.

Use only facts from the CONTEXT when answering.
If the answer isn't in the CONTEXT, say so.
"""

# User-message template with two placeholders, {question} and {context},
# filled by build_prompt. The surrounding newlines are removed by .strip().
RAG_PROMPT_TEMPLATE = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Fill ``RAG_PROMPT_TEMPLATE`` with the question and the search results.

    Args:
        question: the user's question, inserted verbatim.
        search_results: JSON-serialisable search hits; they are rendered as
            indented JSON inside the CONTEXT section.

    Returns:
        The formatted user prompt as a string.
    """
    # ensure_ascii=False keeps accented and other non-ASCII characters as-is
    # instead of expanding each one into a \uXXXX escape, which would bloat
    # the prompt and make the context harder for the model to read.
    context = json.dumps(search_results, indent=2, ensure_ascii=False)
    return RAG_PROMPT_TEMPLATE.format(question=question, context=context)

In [124]:
# Questions tried earlier with this pipeline:
#   "How do I create a dashboard in Evidently?"
#   "What is this page about? https://en.wikipedia.org/wiki/Capybara"
question = "How many characters are in this page? https://en.wikipedia.org/wiki/Capybara"

# Retrieve matching chunks first, then embed them in the user prompt.
search_results = search(question)
user_prompt = build_prompt(question=question, search_results=search_results)

In [125]:
# Plain RAG request: the retrieved context already sits inside user_prompt,
# so no tools are offered to the model here.
messages = [
    dict(role="system", content=RAG_INSTRUCTIONS),
    dict(role="user", content=user_prompt),
]

response = openai_client.responses.create(input=messages, model='gpt-4o-mini')

In [126]:
# Show the text answer returned for the RAG prompt.
print(response.output_text)

The CONTEXT does not provide information about the number of characters on the Wikipedia page for the capybara.


In [127]:
# System prompt for the tool-calling flow: the model is expected to consult the
# knowledge base and to tell the user when no answer can be found.
# (Fixed the "f you cannot" typo in the last sentence.)
instructions = """
You're a documentation assistant. 

Answer the user question using the documentation knowledge base

Use only facts from the knowledge base when answering.
IMPORTANT: If you cannot find the answer, inform the user.
"""

# Function-tool description passed to the model; it exposes one required
# string argument, "query", for a function named "search".
search_tool = dict(
    type="function",
    name="search",
    description="Search the documentation database for relevant results based on a query string.",
    parameters=dict(
        type="object",
        properties=dict(
            query=dict(
                type="string",
                description="The search query to look up in the index",
            ),
        ),
        required=["query"],
    ),
)


In [128]:
# Baseline request without any tools, used to read off the prompt size.
messages = [
    dict(role="system", content=instructions),
    dict(role="user", content=question),
]

response = openai_client.responses.create(input=messages, model='gpt-4o-mini')

response.usage.input_tokens

69

In [129]:
# Same conversation, this time advertising the search tool; comparing the
# token count with the previous cell shows what the tool description costs.
messages = [
    dict(role="system", content=instructions),
    dict(role="user", content=question),
]

response = openai_client.responses.create(
    tools=[search_tool],
    input=messages,
    model='gpt-4o-mini',
)

response.usage.input_tokens

116

In [130]:
# Take the first output item as the tool call.
# NOTE(review): this assumes the model replied with a function call first —
# worth checking the item's type before relying on it.
tool_call = response.output[0]
tool_call

ResponseFunctionToolCall(arguments='{"query":"Capybara Wikipedia page character count"}', call_id='call_REWwecqLNMT20te5FnnVib3r', name='search', type='function_call', id='fc_09163d802011ab5e00698f5be9b408819bab140d3986d7df44', status='completed')

In [131]:
# Add the model's function call to the conversation so the follow-up request
# carries it together with its output. Re-running this cell appends it again.
messages.append(tool_call)

In [132]:
# The arguments arrive as a JSON-encoded string, not as a dict.
tool_call.arguments


'{"query":"Capybara Wikipedia page character count"}'

In [133]:
# Decode the JSON string into a dict that can be splatted into search().
arguments = json.loads(tool_call.arguments)
arguments

{'query': 'Capybara Wikipedia page character count'}

In [134]:
# NOTE(review): this Evidently query looks like a leftover from another
# notebook; its result is overwritten by the next cell before being used.
search_results = search(query='create dashboard in Evidently')

In [135]:
# Run the search with the arguments chosen by the model and peek at the top hit.
search_results = search(**arguments)
search_results[:1]

[{'start': 22500,
  'content': 'chee-GWEE-reh), got a boost in the 18th century when the local clergy asked the Vatican to give capybara the status of fish. (...) It remains more popular in Venezuela\'s rural interior than in the capital.}}</ref> There is widespread perception in Venezuela that consumption of capybaras is exclusive to rural people.<ref name="NYTimes2007"/>\n\nIn August 2021, Argentine and international media reported that capybaras had been disturbing residents of [[Nordelta]], an affluent gated community north of [[Buenos Aires]] built atop the local capybara\'s preexisting wetland habitat. This inspired social media users to jokingly adopt the capybara as a symbol of [[class struggle]] and [[communism]].<ref name="TheGuardian_20210822">{{cite web|title=Attack of the giant rodents or class war? Argentina\'s rich riled by new neighbors|newspaper=[[The Guardian]] |date=22 August 2021 |url=https://www.theguardian.com/world/2021/aug/22/argentina-capybaras-giant-rodents-ga

In [136]:
# Tool result message: tied to the model's request through call_id, with the
# search hits serialised to a JSON string.
call_output = dict(
    type="function_call_output",
    call_id=tool_call.call_id,
    output=json.dumps(search_results),
)

In [137]:
# Append the tool result after the tool call and display the full conversation.
# Re-running this cell appends the same output again.
messages.append(call_output)
messages

[{'role': 'system',
  'content': "\nYou're a documentation assistant. \n\nAnswer the user question using the documentation knowledge base\n\nUse only facts from the knowledge base when answering.\nIMPORTANT: f you cannot find the answer, inform the user.\n"},
 {'role': 'user',
  'content': 'How many characters are in this page? https://en.wikipedia.org/wiki/Capybara'},
 ResponseFunctionToolCall(arguments='{"query":"Capybara Wikipedia page character count"}', call_id='call_REWwecqLNMT20te5FnnVib3r', name='search', type='function_call', id='fc_09163d802011ab5e00698f5be9b408819bab140d3986d7df44', status='completed'),
 {'type': 'function_call_output',
  'call_id': 'call_REWwecqLNMT20te5FnnVib3r',
  'output': '[{"start": 22500, "content": "chee-GWEE-reh), got a boost in the 18th century when the local clergy asked the Vatican to give capybara the status of fish. (...) It remains more popular in Venezuela\'s rural interior than in the capital.}}</ref> There is widespread perception in Venezu

In [138]:
# Second round trip: the conversation now includes the tool call and its
# output, so the model can answer from the retrieved chunks.
response = openai_client.responses.create(
    tools=[search_tool],
    input=messages,
    model='gpt-4o-mini',
)

response.usage.input_tokens

4965

In [139]:
# Show the text answer produced after the tool output was supplied.
print(response.output_text)

I currently do not have access to real-time data, including the character count of web pages like the Wikipedia page on capybaras. You can find the character count by copying the text from the page into a word processor or using a character count tool online.


In [140]:
from typing import Literal
from pydantic import BaseModel, Field


# Structured-output schema passed as text_format to responses.parse below.
# NOTE(review): the class docstring and the Field descriptions are presumably
# included in the schema sent to the model, so treat them as prompt text —
# confirm before rewording.
class RAGResponse(BaseModel):
    """
    This model provides a structured answer with metadata about the response,
    including confidence, categorization, and follow-up suggestions.
    """

    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    # The 0.0-1.0 range is only stated in the description; no bounds are declared on the field.
    confidence: float = Field(description="Confidence score from 0.0 to 1.0 indicating how certain the answer is")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions the user might want to ask")


In [141]:
# Same conversation again, now asking for a reply shaped like RAGResponse.
response = openai_client.responses.parse(
    text_format=RAGResponse,
    tools=[search_tool],
    input=messages,
    model='gpt-4o-mini',
)

response.usage.input_tokens

5189

In [142]:
# Extra input tokens, presumably the cost of the structured-output schema.
# The literals appear to come from an earlier run; the counts displayed above
# (5189 with the schema, 4965 without) differ by the same 224.
4299 - 4075

224

In [143]:
# output_parsed holds the reply parsed into the RAGResponse text_format;
# print only its answer field.
rag_response = response.output_parsed
print(rag_response.answer)

I couldn't find specific information about the character count on the Wikipedia page for Capybara. You may want to check the page directly or utilize online tools that can analyze text from web pages.
