In [40]:
from openai import OpenAI

openai_client = OpenAI()

In [74]:
import requests
from urllib.parse import quote_plus

def search_wikipedia(query: str):
    url = (
        "https://en.wikipedia.org/w/api.php"
        f"?action=query&format=json&list=search&srsearch={quote_plus(query)}"
    )

    r = requests.get(
        url,
        timeout=10,
        headers={"User-Agent": "my-agent/1.0 (contact: you@example.com)"}
    )

    # Fail loudly with useful info
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code}: {r.text[:200]}")

    data = r.json()

    if "query" not in data or "search" not in data["query"]:
        raise RuntimeError(f"Unexpected response shape: {data}")

    return data["query"]["search"]


# Test
if __name__ == "__main__":
    results = search_wikipedia("capybara")
    # print("Top result:", results[0]["title"] if results else "No results")
    # print("Count:", len(results[:]))
    # print(results["title"])
    titles = [r["title"] for r in results[:]]
    print(titles)



['Capybara', 'Lesser capybara', 'Capybara (disambiguation)', 'Capybara Games', 'Hydrochoerus', 'Capybara (software)', 'Caviidae', 'Flow (2024 film)', 'Yuzu bath', 'Kerodon']


In [75]:
results = search_wikipedia("capybara")

capybara_title_results = [
    r for r in results
    if "capybara" in r["title"].lower()
]

count = len(capybara_title_results)

print("Count:", count)
print("Titles:", [r["title"] for r in capybara_title_results])


Count: 5
Titles: ['Capybara', 'Capybara (disambiguation)', 'Capybara Games', 'Lesser capybara', 'Capybara (software)']


In [42]:
# # import requests
# # from urllib.parse import quote_plus

# def get_wikipedia(page_title: str) -> dict:
#     """
#     Fetch raw Wikipedia page content (wikitext) via:
#     https://en.wikipedia.org/w/index.php?title=PAGE_TITLE&action=raw
#     """
#     if not isinstance(page_title, str) or not page_title.strip():
#         raise ValueError("page_title must be a non-empty string")

#     # Browser-style encoding: spaces -> '+'
#     encoded_title = quote_plus(page_title)

#     url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&action=raw"

#     r = requests.get(
#         url,
#         timeout=10,
#         headers={"User-Agent": "my-agent/1.0 (contact: you@example.com)"}
#     )

#     if r.status_code != 200:
#         raise RuntimeError(f"HTTP {r.status_code}: {r.text[:200]}")

#     # Raw endpoint returns plain text (wikitext), not JSON
#     return {
#         "title": page_title,
#         "url": url,
#         "content": r.text
#     }


# # Test
# if __name__ == "__main__":
#     doc = get_wikipedia("Capybara")
#     print("URL:", doc["url"])
#     print("First 500 chars:\n", doc["content"][:500])


In [43]:
import requests
from dataclasses import dataclass
from urllib.parse import quote_plus

@dataclass
class WikipediaPageDocument:
    title: str
    url: str
    raw_content: str

    def parse(self) -> dict:
        return {
            "content": self.raw_content,   # <-- key name gitsource expects
            "metadata": {
                "source": "wikipedia",
                "title": self.title,
                "url": self.url,
            },
        }

class WikipediaPageDataReader:
    def __init__(self, user_agent: str = "my-agent/1.0 (contact: you@example.com)"):
        self.user_agent = user_agent

    def read(self, page_titles: list[str]) -> list[WikipediaPageDocument]:
        docs = []
        for title in page_titles:
            encoded = quote_plus(title)
            url = f"https://en.wikipedia.org/w/index.php?title={encoded}&action=raw"
            r = requests.get(url, timeout=10, headers={"User-Agent": self.user_agent})
            r.raise_for_status()
            docs.append(WikipediaPageDocument(title=title, url=url, raw_content=r.text))
        return docs


In [44]:
from gitsource import chunk_documents

reader = WikipediaPageDataReader()
pages = reader.read(["Capybara", "Lesser capybara"])

parsed_docs = [doc.parse() for doc in pages]
chunked_docs = chunk_documents(parsed_docs, size=3000, step=1500)


In [46]:
from minsearch import AppendableIndex

In [47]:
index = AppendableIndex(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)
index.fit(chunked_docs)

<minsearch.append.AppendableIndex at 0x773458236e40>

In [48]:
def search(query):
    results = index.search(
        query=query,
        num_results=5
    )
    return results

In [49]:
import json

RAG_INSTRUCTIONS = """
You're a documentation assistant. Answer the QUESTION based on the CONTEXT from our documentation.

Use only facts from the CONTEXT when answering.
If the answer isn't in the CONTEXT, say so.
"""

RAG_PROMPT_TEMPLATE = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    context = json.dumps(search_results, indent=2)
    return RAG_PROMPT_TEMPLATE.format(
        question=question,
        context=context
    )

In [50]:
# question = "How do I create a dahsbord in Evidently?"
question = "What is this page about? https://en.wikipedia.org/wiki/Capybara"
search_results = search(question)
user_prompt = build_prompt(question, search_results)

In [51]:
messages = [
    {"role": "system", "content": RAG_INSTRUCTIONS},
    {"role": "user", "content": user_prompt}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=messages,
)

In [52]:
print(response.output_text)

The page about capybaras on Wikipedia provides comprehensive information on several aspects of these animals, including their characteristics, habitat, diet, physical descriptions, adaptations, role in popular culture, and their significance in social contexts. It discusses how capybaras are featured in internet memes, their interactions with humans, and their farming for meat and skins in South America. It also touches on their reputation as symbols of calm and their influence in various cultural and social narratives.


In [53]:
instructions = """
You're a documentation assistant. 

Answer the user question using the documentation knowledge base

Use only facts from the knowledge base when answering.
IMPORTANT: f you cannot find the answer, inform the user.
"""

search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the documentation database for relevant results based on a query string.",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The search query to look up in the index"
            }
        },
        "required": [
            "query"
        ]
    }
}


In [54]:
messages = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=messages,
)

response.usage.input_tokens

67

In [55]:
messages = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=messages,
    tools=[search_tool],
)

response.usage.input_tokens

114

In [56]:
tool_call = response.output[0]
tool_call

ResponseFunctionToolCall(arguments='{"query":"Capybara site:en.wikipedia.org"}', call_id='call_Q0eqREjHYQ9LyVSOO33tNCqg', name='search', type='function_call', id='fc_045211626be0c69b00698f555282a481969ebef012ac10add2', status='completed')

In [57]:
messages.append(tool_call)

In [58]:
tool_call.arguments


'{"query":"Capybara site:en.wikipedia.org"}'

In [59]:
arguments = json.loads(tool_call.arguments)
arguments

{'query': 'Capybara site:en.wikipedia.org'}

In [60]:
search_results = search(query='create dashboard in Evidently')

In [61]:
search_results = search(**arguments)
search_results[:1]

[{'start': 34500,
  'content': '|last2=Honeycutt |first2=Rodney L. |title=Phylogenetic Relationships, Ecological Correlates, and Molecular Evolution Within the Cavioidea (Mammalia, Rodentia) |journal=Molecular Biology and Evolution |date=March 2002 |volume=19 |issue=3 |pages=263–277 |doi=10.1093/oxfordjournals.molbev.a004080 |pmid=11861886 |doi-access=free }}</ref>\n<ref name="Vucetich 2005">{{cite journal |last1=Vucetich |first1=María G. |last2=Deschamps |first2=Cecilia M. |last3=Olivares |first3=Itatí |last4=Dozo |first4=María T. |title=Capybaras, size, shape, and time: A model kit |journal=Acta Palaeontologica Polonica |volume=50 |issue=2 |year=2005 |pages=259–272 |url=https://www.app.pan.pl/article/item/app50-259.html }}</ref>\n<ref name="Deschamp 2007">{{cite journal |last1=Deschamps |first1=Cecilia M. |last2=Olivares |first2=Itatí |last3=Vieytes |first3=Emma Carolina |last4=Vucetich |first4=María Guiomar |title=Ontogeny and diversity of the oldest capybaras (Rodentia: Hydrochoeri

In [62]:
call_output = {
    "type": "function_call_output",
    "call_id": tool_call.call_id,
    "output": json.dumps(search_results),
}

In [63]:
messages.append(call_output)
messages

[{'role': 'system',
  'content': "\nYou're a documentation assistant. \n\nAnswer the user question using the documentation knowledge base\n\nUse only facts from the knowledge base when answering.\nIMPORTANT: f you cannot find the answer, inform the user.\n"},
 {'role': 'user',
  'content': 'What is this page about? https://en.wikipedia.org/wiki/Capybara'},
 ResponseFunctionToolCall(arguments='{"query":"Capybara site:en.wikipedia.org"}', call_id='call_Q0eqREjHYQ9LyVSOO33tNCqg', name='search', type='function_call', id='fc_045211626be0c69b00698f555282a481969ebef012ac10add2', status='completed'),
 {'type': 'function_call_output',
  'call_id': 'call_Q0eqREjHYQ9LyVSOO33tNCqg',
  'output': '[{"start": 34500, "content": "|last2=Honeycutt |first2=Rodney L. |title=Phylogenetic Relationships, Ecological Correlates, and Molecular Evolution Within the Cavioidea (Mammalia, Rodentia) |journal=Molecular Biology and Evolution |date=March 2002 |volume=19 |issue=3 |pages=263\\u2013277 |doi=10.1093/oxford

In [64]:
response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=messages,
    tools=[search_tool],
)

response.usage.input_tokens

5226

In [65]:
print(response.output_text)

The Wikipedia page for the **Capybara** provides comprehensive information about this large semi-aquatic rodent, scientifically named *Hydrochoerus hydrochaeris*. It covers various aspects, including its anatomy, behavior, habitat, diet, and ecological significance. Additionally, the page touches on the capybara's cultural impact and presence in social media as a meme.

Key points include:
- **Physical Description**: Capybaras are known as the largest rodents in the world, characterized by their barrel-shaped bodies and webbed feet.
- **Habitat**: They inhabit areas near water, such as swamps and rivers, primarily in South America.
- **Diet**: Being herbivores, they primarily eat grasses and aquatic plants.
- **Social Behavior**: Capybaras live in groups and exhibit social behavior, often seen resting in water or basking in the sun together.
- **Cultural Significance**: In recent years, they have gained popularity as internet icons and symbols of relaxation and calmness.

For further d

In [66]:
from typing import Literal
from pydantic import BaseModel, Field


class RAGResponse(BaseModel):
    """
    This model provides a structured answer with metadata about the response,
    including confidence, categorization, and follow-up suggestions.
    """

    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    confidence: float = Field(description="Confidence score from 0.0 to 1.0 indicating how certain the answer is")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions the user might want to ask")


In [67]:
response = openai_client.responses.parse(
    model='gpt-4o-mini',
    input=messages,
    tools=[search_tool],
    text_format=RAGResponse
)

response.usage.input_tokens

5450

In [68]:
4299 - 4075

224

In [69]:
rag_response = response.output_parsed
print(rag_response.answer)

The Wikipedia page on **Capybara** provides comprehensive information about this species, which is the largest rodent in the world. It covers various aspects, including:

1. **Taxonomy and Species**: The scientific classification and differentiation between the common capybara (*Hydrochoerus hydrochaeris*) and lesser capybara (*Hydrochoerus isthmius*).
2. **Habitat**: The capybara is semi-aquatic, typically found in habitat close to water such as rivers, lakes, and marshes across South America.
3. **Diet**: They are herbivores, primarily feeding on grasses and other aquatic plants.
4. **Behavior**: The page discusses social behavior, reproduction, and communication among capybaras.
5. **Cultural Significance**: Capybaras have gained popularity in internet culture and memes, seen as symbols of calm and community.
6. **Conservation Status**: Information on their conservation status and any related threats.
7. **Further Reading**: Links to external resources, studies, and articles about c