In [17]:
import json
from minsearch import AppendableIndex

In [21]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [22]:
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x736b2f199ac0>

In [27]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results

In [28]:
results = search('I just discovered the course. Can I join now?')
print(results[0]['text'])

Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


In [29]:
from openai import OpenAI
client = OpenAI()

In [26]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [30]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
rag('I just discovered the course. Can I join now?')

"Yes, you can still join the course now. Even if you don't register, you're eligible to submit homework assignments. However, keep in mind that there will be deadlines for final projects, so it’s best not to leave everything until the last minute."

In [32]:
rag('how do I run docker on gentoo?')

"The provided context does not include specific information or instructions on how to run Docker on Gentoo. Therefore, I cannot provide an answer based on the existing context. Please refer to Gentoo's official documentation or community resources for guidance on running Docker on that specific operating system."

In [49]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [38]:
question = "how do I run docker on gentoo?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I run docker on gentoo?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}


In [39]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To run Docker on Gentoo, you can follow these steps:\n\n1. **Install Docker**: You can install Docker on Gentoo using the package manager. Open your terminal and run:\n   ```
   emerge app-emulation/docker\n   ```\n   This command will download and install Docker along with its dependencies.\n\n2. **Add User to Docker Group**: After the installation, add your user to the 'docker' group to manage Docker without needing root permissions:\n   ```
   usermod -aG docker your_username\n   ```\n   Remember to replace 'your_username' with your actual username.\n\n3. **Start Docker Daemon**: You need to start the Docker service. You can do that with systemd or OpenRC, depending on your setup. For OpenRC, run:\n   ```
   rc-service docker start\n   ```\n   For systemd, use:\n   ```
   systemctl start docker\n   ```\n\n4. **Enable Docker to Start at Boot**: You may want Docker to start automatically when the system boots. For OpenRC, run:\n   ```
   rc-update add 

In [40]:
question = "how do I join the course?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
answer = llm(prompt)
print(answer)

{
"action": "SEARCH",
"reasoning": "The student asked how to join the course, but the context does not provide information on enrollment procedures. Therefore, I need to refer to the FAQ database for accurate instructions."
}


In [41]:
def build_context(search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context.strip()

In [42]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course start

In [43]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you need to register before the start date using the provided registration link. Additionally, you should subscribe to the course public Google Calendar to stay updated and join the course's Telegram channel for announcements. Remember, even if you miss the registration deadline, you are still eligible to submit your homework assignments.",
"source": "CONTEXT"
}


In [50]:
def agentic_rag_v1(question):
    context = "EMPTY"
    prompt = prompt_template.format(question=question, context=context)
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(answer)

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        search_results = search(question)
        context = build_context(search_results)
        
        prompt = prompt_template.format(question=question, context=context)
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(answer)

    return answer

In [51]:
agentic_rag_v1('how do I join the course?')

{'action': 'ANSWER', 'answer': "To join the course, you typically need to visit the course's official website or the platform it's being offered on. Look for a 'Sign Up' or 'Enroll' button, fill out the registration form, and follow the instructions provided to complete your enrollment. If there is an application process or prerequisites, be sure to review those beforehand.", 'source': 'OWN_KNOWLEDGE'}


{'action': 'ANSWER',
 'answer': "To join the course, you typically need to visit the course's official website or the platform it's being offered on. Look for a 'Sign Up' or 'Enroll' button, fill out the registration form, and follow the instructions provided to complete your enrollment. If there is an application process or prerequisites, be sure to review those beforehand.",
 'source': 'OWN_KNOWLEDGE'}

In [52]:
agentic_rag_v1('how patch KDE under FreeBSD?')

{'action': 'ANSWER', 'answer': "To patch KDE under FreeBSD, you can follow these general steps: 1. First, ensure you have the necessary development tools installed. You can do this by installing the `ports` collection if you haven't already. You can enable ports in the `/etc/make.conf` file by adding `WITH_PKGNG=yes`. 2. Navigate to the KDE ports directory that corresponds to the version of KDE you want to patch. You can find KDE ports in `/usr/ports/x11/kde5`, for example. 3. Apply your patch files to the source code. You can use the `patch` command, which takes the patch file and the target file as arguments. For example: `patch < my_patch.patch`. 4. After applying the patch, navigate to the port directory and update the port with `make install clean` to rebuild and install the patched version. 5. Finally, restart your KDE session to apply the changes. Always ensure you have backups and test patches in a development environment when possible.", 'source': 'OWN_KNOWLEDGE'}


{'action': 'ANSWER',
 'answer': "To patch KDE under FreeBSD, you can follow these general steps: 1. First, ensure you have the necessary development tools installed. You can do this by installing the `ports` collection if you haven't already. You can enable ports in the `/etc/make.conf` file by adding `WITH_PKGNG=yes`. 2. Navigate to the KDE ports directory that corresponds to the version of KDE you want to patch. You can find KDE ports in `/usr/ports/x11/kde5`, for example. 3. Apply your patch files to the source code. You can use the `patch` command, which takes the patch file and the target file as arguments. For example: `patch < my_patch.patch`. 4. After applying the patch, navigate to the port directory and update the port with `make install clean` to rebuild and install the patched version. 5. Finally, restart your KDE session to apply the changes. Always ensure you have backups and test patches in a development environment when possible.",
 'source': 'OWN_KNOWLEDGE'}

## Agentic search

In [53]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.


Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}


<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [72]:
question = "how do I join the course?"

search_queries = []
search_results = []
previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [73]:
answer_json = llm(prompt)
answer = json.loads(answer_json)

In [74]:
previous_actions.append(answer)

In [75]:
previous_actions

[{'action': 'SEARCH',
  'reasoning': 'To provide specific instructions on how to join the course, I need to find relevant information from the FAQ database regarding enrollment procedures or joining processes.',
  'keywords': ['how to enroll in the course',
   'joining procedures for the course',
   'course registration instructions']}]

In [76]:
print(json.dumps(answer, indent=2))

{
  "action": "SEARCH",
  "reasoning": "To provide specific instructions on how to join the course, I need to find relevant information from the FAQ database regarding enrollment procedures or joining processes.",
  "keywords": [
    "how to enroll in the course",
    "joining procedures for the course",
    "course registration instructions"
  ]
}


In [77]:
keywords = answer['keywords']
search_queries.extend(keywords)

In [78]:
for k in keywords:
    res = search(k)
    search_results.extend(res)

In [79]:
len(search_results)

15

In [80]:
def dedup(seq):
    seen = set()
    result = []
    for el in seq:
        _id = el['_id']
        if _id in seen:
            continue
        seen.add(_id)
        result.append(el)
    return result

In [81]:
search_results = dedup(search_results)

In [82]:
len(search_results)

6

In [83]:
# question = "how do I join the course?"

# search_queries = []
# search_results = []
# previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=2
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [84]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "ANSWER_CONTEXT",
  "answer": "To join the course, you need to register before it starts using the provided registration link and join the course's public Google Calendar as well as its Telegram channel for announcements. You don't need a confirmation email; you are accepted once you register, and you can start learning and submitting homework immediately without being on a registered list.",
  "source": "CONTEXT"
}


In [86]:
question = "what do I need to do to be successful at module 1?"

search_queries = []
search_results = []
previous_actions = []


iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=3,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    search_queries = list(set(search_queries) | set(keywords))
    
    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)
    
    iteration = iteration + 1
    if iteration >= 4:
        break

    print()


ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

In [87]:
def agentic_search(question):
    search_queries = []
    search_results = []
    previous_actions = []

    iteration = 0
    
    while True:
        print(f'ITERATION #{iteration}...')
    
        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=3,
            iteration_number=iteration
        )
    
        print(prompt)
    
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)
    
        action = answer['action']
        if action != 'SEARCH':
            break
    
        keywords = answer['keywords']
        search_queries = list(set(search_queries) | set(keywords))

        for k in keywords:
            res = search(k)
            search_results.extend(res)
    
        search_results = dedup(search_results)
        
        iteration = iteration + 1
        if iteration >= 4:
            break
    
        print()

    return answer

In [88]:
agentic_search('how do I prepare for the course?')

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

{'action': 'ANSWER',
 'answer': "To prepare for the course, consider the following strategies: \n1. **Register Early**: Make sure you register for the course before it starts to gain access to all materials. \n2. **Join Communication Channels**: Subscribe to the course public Google Calendar, and join the Telegram channel for announcements to stay updated.\n3. **Set Up Required Tools**: Familiarize yourself with Git/GitHub as mentioned in the FAQs, since coding and collaboration via repositories will be necessary. Watch tutorials if you're unsure how to clone repositories or set up your environment.\n4. **Review Course Materials**: Since materials will be available after the course finishes, you can start familiarizing yourself with them in advance. \n5. **Participate Actively**: Engage in live sessions and peer reviews to understand concepts better, as interaction enhances learning.\n6. **Study Habits**: Develop a consistent study schedule and utilize additional resources. For example

In [94]:
print(globals()['answer'])

{'action': 'ANSWER', 'answer': "To be successful in Module 1: Docker and Terraform, here are some general tips that can help:\n\n1. **Understand the Basics**: Familiarize yourself with the fundamental concepts of Docker and Terraform. Ensure you can define terms like containers, images, infrastructure as code, etc.\n\n2. **Hands-on Practice**: Actively engage with the tools. Set up Docker and Terraform on your local machine and practice with sample projects. Build small applications using Docker and try to manage infrastructure using Terraform.\n\n3. **Follow Tutorials**: Utilize available tutorials that guide you through common tasks and configurations within Docker and Terraform.\n\n4. **Engage with the Community**: Participate in forums, discussion groups, or study sessions with other students. Sharing experiences and troubleshooting together can enhance your learning.\n\n5. **Learn from Errors**: Don't be discouraged by errors or challenges faced in the module. Each obstacle is a l

## Tools (function calling)

https://platform.openai.com/docs/guides/function-calling

    def search(query):
        boost = {'question': 3.0, 'section': 0.5}
    
        results = index.search(
            query=query,
            filter_dict={'course': 'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5,
            output_ids=True
        )
    
        return results


In [95]:
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }

}

In [96]:
tools = [search_tool]

In [97]:
question = "How do I do well in module 1?"

In [98]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [99]:
response

Response(id='resp_687765102a3881a1a665a55fdc01ce2f06cdef125cb6f434', created_at=1752655120.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_BdJd6v1O9d7fEq8hGzdyfQka', name='search', type='function_call', id='fc_68776510c1fc81a19702560302a0640e06cdef125cb6f434', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=Reasoning(effort=None, generate_summary=None, summary=

In [100]:
response.output

[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_BdJd6v1O9d7fEq8hGzdyfQka', name='search', type='function_call', id='fc_68776510c1fc81a19702560302a0640e06cdef125cb6f434', status='completed')]

In [101]:
# response.choices[0].message.content
calls = response.output

In [102]:
call = calls[0]
call

ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_BdJd6v1O9d7fEq8hGzdyfQka', name='search', type='function_call', id='fc_68776510c1fc81a19702560302a0640e06cdef125cb6f434', status='completed')

In [103]:
call.call_id

'call_BdJd6v1O9d7fEq8hGzdyfQka'

In [104]:
f_name = call.name
f_name

'search'

In [105]:
arguments = json.loads(call.arguments)
arguments

{'query': 'how to do well in module 1'}

In [106]:
f = locals()[f_name]

In [107]:
results = f(**arguments)

In [108]:
search_results = json.dumps(results, indent=2)
print(search_results)

[
  {
    "text": "Issue:\ne\u2026\nSolution:\npip install psycopg2-binary\nIf you already have it, you might need to update it:\npip install psycopg2-binary --upgrade\nOther methods, if the above fails:\nif you are getting the \u201c ModuleNotFoundError: No module named 'psycopg2' \u201c error even after the above installation, then try updating conda using the command conda update -n base -c defaults conda. Or if you are using pip, then try updating it before installing the psycopg packages i.e\nFirst uninstall the psycopg package\nThen update conda or pip\nThen install psycopg again using pip.\nif you are still facing error with r pcycopg2 and showing pg_config not found then you will have to install postgresql. in MAC it is brew install postgresql",
    "section": "Module 1: Docker and Terraform",
    "question": "Postgres - ModuleNotFoundError: No module named 'psycopg2'",
    "course": "data-engineering-zoomcamp",
    "_id": 112
  },
  {
    "text": "Following dbt with BigQuery o

In [109]:
chat_messages.append(call)

In [110]:
chat_messages.append({
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": search_results,
})

In [111]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [112]:
r = response.output[0]

In [113]:
print(r.content[0].text)

To do well in Module 1 of your course, which focuses on Docker and Terraform, consider the following strategies:

1. **Understand Key Concepts**:
   - Familiarize yourself with Docker and Terraform fundamentals. Ensure you understand how containers work, as well as the basics of Infrastructure as Code (IaC) using Terraform.

2. **Hands-on Practice**:
   - Set up Docker and create your first container. Practice commands like `docker run`, `docker-compose`, and learn to manage images.
   - Work through examples on Terraform to create resources in your cloud environment. Start with simple infrastructure setups and gradually move to more complex configurations.

3. **Follow Course Materials**:
   - Carefully read the course materials, including any README files and documentation related to the exercises in Module 1. 

4. **Solve Common Issues**:
   - Be prepared to troubleshoot common errors. For instance, you might encounter the "ModuleNotFoundError" related to libraries like `psycopg2`. 

In [114]:
r.type

'message'

In [115]:
call.type

'function_call'

### Multiple calls

In [116]:
def do_call(tool_call_response):
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }

In [117]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [118]:
response

Response(id='resp_687765b5c514819ca21c3a98118ea1c604ab1cc29eab414c', created_at=1752655285.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"module 1 tips for success"}', call_id='call_S4GeGhDIlnLBGRrGaafLw9L0', name='search', type='function_call', id='fc_687765b66b48819c93e333dc4165a9ee04ab1cc29eab414c', status='completed'), ResponseFunctionToolCall(arguments='{"query":"module 1 study strategies"}', call_id='call_ezam4h3XgQxQ2wJXN9m8pRIh', name='search', type='function_call', id='fc_687765b6cac4819cb1fcd01ecf3b02e904ab1cc29eab414c', status='completed'), ResponseFunctionToolCall(arguments='{"query":"module 1 assessment overview"}', call_id='call_FNZnwxDzRx1m0JouZtYJUviZ', name='search', type='function_call', id='fc_687765b71e40819c86f6006b3abee77904ab1cc29eab414c', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[Funct

In [119]:
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.text) 

function_call
function_call
function_call


In [120]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)
    print()

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.content[0].text) 

message

To excel in Module 1, here are some key strategies and tips:

### 1. Understand the Course Material
- **Review Lecture Notes**: Make sure to review all lecture slides and notes provided. Understanding the foundational concepts is crucial.
- **Read Recommended Resources**: Go through any additional readings suggested by the instructor.

### 2. Practical Exercises
- **Hands-on Practice**: Engage in any coding exercises or practical implementations given in the module, especially with tools like Docker and Terraform.
- **Code Along with Examples**: Follow along with any coding demonstrations provided in the lectures.

### 3. Troubleshooting Common Issues
- **Install Necessary Packages**: Ensure you have all required packages installed. For example, if you encounter a `ModuleNotFoundError` for `psycopg2`, install it using:
  ```bash
  pip install psycopg2-binary
  ```
- **Keep Software Updated**: Regularly update your installations, using:
  ```bash
  pip install --upgrade psycopg

### Putting it all together

Have two loops:

- First is the main Q&A loop - ask question, get back the answer
- Second is the request loop - send requests until there's a message reply from API

In [121]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [122]:
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False
        
        for entry in response.output:
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                result = do_call(entry)
                chat_messages.append(result)
            elif entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True

        if has_messages:
            break

 How do I do my best in Module 2?


function_call: ResponseFunctionToolCall(arguments='{"query":"best practices for Module 2"}', call_id='call_MhTI4UfXHYoiEQQCzlOWiWqB', name='search', type='function_call', id='fc_6877674dd42481a0b2c24f466a139c540d8a7f064d673908', status='completed')

function_call: ResponseFunctionToolCall(arguments='{"query":"Module 2 best practices study tips"}', call_id='call_9lcpW6gGa8I6Ckj0kzFDaSi8', name='search', type='function_call', id='fc_6877674eddd081a0b7b77f73e41221f70d8a7f064d673908', status='completed')

To excel in Module 2, here are some strategies you can consider:

1. **Understand the Content**: Make sure you have a clear understanding of the topics covered in the module. This might include tools or technologies introduced in this module. If there are any best practices mentioned, such as in the Docker documentation, review those thoroughly.

2. **Hands-On Practice**: Engage in practical exercises. Try to replicate examples and work through any provided assignments or lab exercises. T

 3. Utilize resources


function_call: ResponseFunctionToolCall(arguments='{"query":"recommended resources for Module 2"}', call_id='call_WszplioFrMyx5UWjGfz4W73t', name='search', type='function_call', id='fc_687767868b4881a09cbfd1b2efd1ed820d8a7f064d673908', status='completed')

Here are some recommended resources to help you excel in Module 2:

1. **Official Documentation**: Make it a habit to refer to the official documentation of the tools or technologies covered in Module 2. This can provide detailed insights and examples not covered in course materials.

2. **Online Courses and Tutorials**: Platforms like Coursera, Udacity, or edX may have supplementary courses that cover the same topics. This could provide different perspectives or use cases.

3. **YouTube Tutorials**: There are many tutorial channels focusing on data engineering tools (like Kafka, Spark, etc.) which can reinforce your understanding through visual aids.

4. **Forums and Community Support**: Engage in discussions on platforms like Stack

 stop


Same using widgets

In [123]:
from IPython.display import display, HTML
import markdown # pip install markdown

In [125]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

# Chat loop
while True:
    question = input()
    if question.strip().lower() == 'stop':
        print("Chat ended.")
        break
    print()

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True:  # inner request loop
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False

        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == "function_call":
                result = do_call(entry)
                chat_messages.append(result)
                display_function_call(entry, result)

            elif entry.type == "message":
                display_response(entry)
                has_messages = True

        if has_messages:
            break

 I need a transcript of all the lecture videos in Module 2.





NameError: name 'display_function_call' is not defined

## Adding more tools

In [126]:
def add_entry(question, answer):
    doc = {
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp'
    }
    index.append(doc)

In [127]:
add_entry_description = {
    "type": "function",
    "name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        "required": ["question", "answer"],
        "additionalProperties": False
    }
}

In [128]:
import chat_assistant

In [129]:
tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)

In [130]:
tools.get_tools()


[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}}]

In [131]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()

chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [132]:
chat.run()

You: Can I still join the course?


You: When is the deadline for the project?


You: stop


Chat ended.


In [133]:
tools.add_tool(add_entry, add_entry_description)
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [134]:
chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [135]:
chat.run()

You: How do I fail in Module 1?


You: Add this back to the FAQ


You: index[-1]


You: stop


Chat ended.


In [137]:
index.docs[-1]

{'question': 'How do I fail in Module 1?',
 'text': "To fail in Module 1, you would likely need to demonstrate consistent poor performance in assessments, quizzes, and other evaluations. Specifically, the following actions could contribute to a failure: 1. Not Completing Assignments: Failing to submit any assignments or assessments for Module 1 would directly impact your grade. 2. Consistently Poor Performance: Scoring low grades on quizzes, exams, or practical assignments might lead to an unsuccessful completion of the module. 3. Disregarding Participation Requirements: Failing to engage in group discussions or activities, if required, could also lead to a lower overall evaluation. 4. Not Seeking Help: Ignoring feedback or failing to seek help when facing challenges could lead to continued misunderstanding of the course content. It's important to manage your time well, actively engage with the materials, and seek assistance when needed to avoid these pitfalls.",
 'section': 'user adde

In [138]:
pip install pydantic-ai

Collecting pydantic-ai
  Downloading pydantic_ai-0.4.2-py3-none-any.whl.metadata (11 kB)
Collecting pydantic-ai-slim==0.4.2 (from pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.4.2->pydantic-ai)
  Downloading pydantic_ai_slim-0.4.2-py3-none-any.whl.metadata (3.8 kB)
Collecting eval-type-backport>=0.2.0 (from pydantic-ai-slim==0.4.2->pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.4.2->pydantic-ai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting griffe>=1.3.2 (from pydantic-ai-slim==0.4.2->pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.4.2->pydantic-ai)
  Downloading griffe-1.7.3-py3-none-any.whl.metadata (5.0 kB)
Collecting opentelemetry-api>=1.28.0 (from pydantic-ai-slim==0.4.2->pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.4.2->pydantic-ai)
  Downloading 

In [139]:
from pydantic_ai import Agent, RunContext

In [140]:
chat_agent = Agent(  
    'openai:gpt-4o-mini',
    system_prompt=developer_prompt
)

In [141]:
from typing import Dict

In [142]:
@chat_agent.tool
def search_tool(ctx: RunContext, query: str) -> Dict[str, str]:
    """
    Search the FAQ for relevant entries matching the query.

    Parameters
    ----------
    query : str
        The search query string provided by the user.

    Returns
    -------
    list
        A list of search results (up to 5), each containing relevance information 
        and associated output IDs.
    """
    print(f"search('{query}')")
    return search(query)

In [143]:
@chat_agent.tool
def add_entry_tool(ctx: RunContext, question: str, answer: str) -> None:
    """
    Add a new question-answer entry to FAQ.

    This function creates a document with the given question and answer, 
    tagging it as user-added content.

    Parameters
    ----------
    question : str
        The question text to be added to the index.

    answer : str
        The answer or explanation corresponding to the question.

    Returns
    -------
    None
    """
    return add_entry(question, answer)

In [144]:
user_prompt = "I just discovered the course. Can I join now?"
agent_run = await chat_agent.run(user_prompt)
print(agent_run.output)

search('course enrollment')
Yes, you can join the course! However, you will need to register before the course starts. The course begins on January 15, 2024, at 17:00. Make sure to sign up using the registration link that is provided by the course.

Would you like to know how to register or any other details about the course?
