In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [5]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [2]:
# !pip install minsearch 

from minsearch import AppendableIndex

index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)


<minsearch.append.AppendableIndex at 0x701887b7d280>

In [3]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results



In [4]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [18]:
from openai import OpenAI
openai_api_key = "sk-proj-"
client = OpenAI(api_key=openai_api_key)

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [19]:
question = "Can I still start the course?"

search_results = search(question)
prompt = build_prompt(question, search_results)
llm(prompt)


"Yes, you can still start the course after the start date. Even without registering, you are eligible to submit homework. However, be mindful of the deadlines for turning in final projects, so it's best not to leave everything to the last minute."

In [12]:
from openai import OpenAI
client = OpenAI()

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [21]:
rag("How do I patch KDE under FreeBSD?")

"I'm sorry, but there is no information provided in the CONTEXT regarding how to patch KDE under FreeBSD. Please check the official documentation or community forums for detailed instructions."

In [22]:
llm("How do I patch KDE under FreeBSD?")

"Patching KDE (K Desktop Environment) under FreeBSD involves a few steps, typically utilizing FreeBSD’s ports system or package management functionalities. Below are the general steps to apply a patch to KDE under FreeBSD:\n\n### Step 1: Ensure FreeBSD is Updated\n\nFirst, make sure your FreeBSD system is up-to-date:\n\n```sh\nsudo pkg update\nsudo pkg upgrade\n```\n\n### Step 2: Install KDE via Ports or Packages\n\nDecide whether you are using the ports system or binary packages. If you haven't installed KDE yet, you can:\n\n- **Using Ports:**\n  ```sh\n  cd /usr/ports/x11/kde5\n  make install clean\n  ```\n\n- **Using Packages:**\n  ```sh\n  sudo pkg install kde5\n  ```\n\n### Step 3: Locate the KDE Install Directory\n\nIf you installed KDE through ports, it's usually located in `/usr/local`. If through packages, the files will still be in `/usr/local`.\n\n### Step 4: Download or Create the Patch\n\nObtain the patch you wish to apply. Patches may be distributed as `.diff` or `.patch`

In [34]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()


In [37]:
import json

question = "Can I still join the course?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
answer_json = llm(prompt)
answer = json.loads(answer_json)

In [38]:
answer

{'action': 'SEARCH',
 'reasoning': 'The question about joining the course may depend on factors such as enrollment deadlines or prerequisites, which are not provided in the current context.'}

In [40]:
def build_context(search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context.strip()

search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
answer_json = llm(prompt)
answer_json

'{\n"action": "ANSWER",\n"answer": "Yes, you can still join the course even after the start date. You\'re eligible to submit homework assignments, but keep in mind that there will be deadlines for the final projects, so it\'s best not to wait until the last minute to catch up.",\n"source": "CONTEXT"\n}'

In [43]:
def agentic_rag_v1(question):
    context = "EMPTY"
    prompt = prompt_template.format(question=question, context=context)
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    # print(answer)

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        search_results = search(question)
        context = build_context(search_results)
        
        prompt = prompt_template.format(question=question, context=context)
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        # print(answer)

    return answer


question = "How is the weather?"

agentic_rag_v1(question)

need to perform search...


{'action': 'SEARCH',
 'reasoning': 'The question pertains to current weather conditions, which are not available in the provided context. Therefore, a search for the latest weather data is necessary.'}

In [44]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()


In [49]:
question = "How do I excel in module 1?"

context = build_context(search_results)
max_iterations = 3
iteration_number = 0
search_queries = []
search_results = []
previous_actions = []

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)

print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [61]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
keywords = answer['keywords']
print(keywords)
print(answer)

['tips to excel in module 1', 'how to do well in module 1', 'study strategies for module 1']
{'action': 'SEARCH', 'reasoning': 'To provide specific tips and resources that could help a student excel in module 1, I will search for strategies, study tips, or resources related to excelling in this module as it seems to be a common concern among students.', 'keywords': ['tips to excel in module 1', 'how to do well in module 1', 'study strategies for module 1']}


In [62]:
def dedup(seq):
    seen = set()
    result = []
    for el in seq:
        _id = el['_id']
        if _id in seen:
            continue
        seen.add(_id)
        result.append(el)
    return result


for kw in keywords:
    search_queries.append(kw)
    res = search(kw)
    search_results.extend(res)


In [63]:
search_results = dedup(search_results)
search_results

[{'text': 'You need to look for the Py4J file and note the version of the filename. Once you know the version, you can update the export command accordingly, this is how you check yours:\n` ls ${SPARK_HOME}/python/lib/ ` and then you add it in the export command, mine was:\nexport PYTHONPATH=”${SPARK_HOME}/python/lib/Py4J-0.10.9.5-src.zip:${PYTHONPATH}”\nMake sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will encounter `ModuleNotFoundError: No module named \'py4j\'` while executing `import pyspark`.\nFor instance, if the file under `${SPARK_HOME}/python/lib/` was `py4j-0.10.9.3-src.zip`.\nThen the export PYTHONPATH statement above should be changed to `export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"` appropriately.\nAdditionally, you can check for the version of ‘py4j’ of the spark you’re using from here and update as mentioned above.\n~ Abhijit Chakraborty: Sometimes, even with adding the correct version of p

In [64]:
def agentic_search(question):
    search_queries = []
    search_results = []
    previous_actions = []

    iteration = 0
    
    while True:
        print(f'ITERATION #{iteration}...')
    
        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=3,
            iteration_number=iteration
        )
    
        print(prompt)
    
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)
    
        action = answer['action']
        if action != 'SEARCH':
            break
    
        keywords = answer['keywords']
        search_queries = list(set(search_queries) | set(keywords))

        for k in keywords:
            res = search(k)
            search_results.extend(res)
    
        search_results = dedup(search_results)
        
        iteration = iteration + 1
        if iteration >= 4:
            break
    
        print()

    return answer


In [67]:
answer = agentic_search('how do I prepare for the course?')
print("____________")
answer

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

{'action': 'ANSWER',
 'answer': "To prepare for the course, you should start by installing and setting up all the necessary dependencies. This includes having a Google Cloud account, Google Cloud SDK, Python 3 (which can be installed via Anaconda), Terraform, and Git. It's also advisable to go over the course prerequisites and syllabus to ensure you're comfortable with the topics that will be covered. Additionally, familiarize yourself with any recommended tools and resources listed in the course materials, so you can hit the ground running when the course starts.",
 'source': 'OWN_KNOWLEDGE'}

In [72]:
## Fuction calling

In [73]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results


In [74]:
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}


In [87]:
question = "How do I do well in module 1?"

# developer_prompt = """
# You're a course teaching assistant. 
# You're given a question from a course student and your task is to answer it.
# """.strip()

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)
calls = response.output
calls

[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_yrLPTL2f33vxtIjGpT4Ee4db', name='search', type='function_call', id='fc_68795be91c0c819e98c6d9d5a10119e60145e8c522432a7f', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"tips for success in module 1"}', call_id='call_2SXEKJNJa0kogwaGVQttmhCI', name='search', type='function_call', id='fc_68795be96838819ea97a430426bc02eb0145e8c522432a7f', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"module 1 study strategies"}', call_id='call_AaoUJpZSYXVf2SU30taABypa', name='search', type='function_call', id='fc_68795be9aa34819ebb127bb3f7ae31060145e8c522432a7f', status='completed')]

In [93]:
for call in calls:
    f_name = call.name
    arguments = json.loads(call.arguments)
    f = globals()[f_name]  # => search()
    results = f(**arguments)        # => search({"query":"module 1 tips"})
    chat_messages .append(call)
    
    chat_messages.append({
        "type": "function_call_output",
        "call_id": call.call_id,
        "output": search_results,
    })

In [96]:
print(chat_messages)

[{'role': 'developer', 'content': "You're a course teaching assistant. \nYou're given a question from a course student and your task is to answer it.\nIf you look up something in FAQ, convert the student question into multiple queries."}, {'role': 'user', 'content': 'How do I do well in module 1?'}, ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_yrLPTL2f33vxtIjGpT4Ee4db', name='search', type='function_call', id='fc_68795be91c0c819e98c6d9d5a10119e60145e8c522432a7f', status='completed'), {'type': 'function_call_output', 'call_id': 'call_yrLPTL2f33vxtIjGpT4Ee4db', 'output': [{'text': 'You need to look for the Py4J file and note the version of the filename. Once you know the version, you can update the export command accordingly, this is how you check yours:\n` ls ${SPARK_HOME}/python/lib/ ` and then you add it in the export command, mine was:\nexport PYTHONPATH=”${SPARK_HOME}/python/lib/Py4J-0.10.9.5-src.zip:${PYTHONPATH}”\nMake sure that the ve

## Putting everything together

In [110]:
def do_call(tool_call_response):
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }


developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]




while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False
        
        for entry in response.output:
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                result = do_call(entry)
                chat_messages.append(result)
            elif entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True

        if has_messages:
            break



 how do I start course?


function_call: ResponseFunctionToolCall(arguments='{"query":"how to start course"}', call_id='call_pR7gdd3FrrScyuSvfHQ9F3Ch', name='search', type='function_call', id='fc_6879617f939881a1b8385c1b34156be204786ae638b708dd', status='completed')

To start the course, here are the steps you need to follow:

1. **Registration**: You can register before the course starts using the provided registration link. However, you can also start learning and submitting your homework without needing to register, as registration is primarily for gauging interest.

2. **Course Start Date**: The course officially starts on **15th January 2024 at 17:00** with a live session called "Office Hours."

3. **Prepare in Advance**: Before the course begins, you'll want to set up the necessary tools and accounts:
   - Create a Google Cloud account.
   - Install the Google Cloud SDK.
   - Install Python 3 (preferably using Anaconda).
   - Install Terraform and Git.
   - Review the prerequisites and course syllabus to 

 stop


## Using PydanticAI

In [118]:
# !pip install pydantic-ai

from pydantic_ai import Agent, RunContext
import os

os.environ["OPENAI_API_KEY"] = openai_api_key

chat_agent = Agent(  
    'openai:gpt-4o-mini',
    system_prompt=developer_prompt
)

from typing import Dict


@chat_agent.tool
def search_tool(ctx: RunContext, query: str) -> Dict[str, str]:
    """
    Search the FAQ for relevant entries matching the query.

    Parameters
    ----------
    query : str
        The search query string provided by the user.

    Returns
    -------
    list
        A list of search results (up to 5), each containing relevance information 
        and associated output IDs.
    """
    print(f"search('{query}')")
    return search(query)


@chat_agent.tool
def add_entry_tool(ctx: RunContext, question: str, answer: str) -> None:
    """
    Add a new question-answer entry to FAQ.

    This function creates a document with the given question and answer, 
    tagging it as user-added content.

    Parameters
    ----------
    question : str
        The question text to be added to the index.

    answer : str
        The answer or explanation corresponding to the question.

    Returns
    -------
    None
    """
    return add_entry(question, answer)


user_prompt = "I just discovered the course. Can I join now?"
agent_run = await chat_agent.run(user_prompt)
print(agent_run.output)



search('Can I join the course now?')
Yes, you can still join the course now! Even if you don't register, you're eligible to submit the homework assignments. However, keep in mind that there will be deadlines for turning in the final projects, so it's best to manage your time wisely and not leave everything for the last minute.

Would you like to know more about the course content or resources available?
