In [1]:
import json
from minsearch import AppendableIndex

In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x7eff19dd1d00>

In [4]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results

In [5]:
results = search('I just discovered the course. Can I join now?')
print(results[0]['text'])

Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


In [5]:
from openai import OpenAI
client = OpenAI()

In [11]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [12]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [117]:
rag('I just discovered the course. Can I join now?')

KeyError: 'max_iterations'

In [14]:
rag('how do I run docker on gentoo?')

"The provided context does not include specific instructions on how to run Docker on Gentoo. Therefore, I cannot provide a direct answer to your question about running Docker on that operating system. If you need guidance on this topic, please consult Gentoo's official documentation or Docker's installation guide tailored for Gentoo."

In [15]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [16]:
question = "how do I run docker on gentoo?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I run docker on gentoo?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}


In [17]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To run Docker on Gentoo, you need to first install Docker. Here are the general steps: \n\n1. **Install Docker**: You can install Docker from the Portage tree. Use the following command: \n   ```bash\n   emerge app-emulation/docker\n   ```\n   Make sure to enable the necessary USE flags as per your requirement.\n\n2. **Add your user to the Docker group**: This allows you to run Docker commands without `sudo`. Use the following command to add your user: \n   ```bash\n   usermod -aG docker yourusername\n   ```\n   Remember to replace `yourusername` with your actual username.\n\n3. **Start the Docker service**: Enable Docker to start automatically at boot with the following command: \n   ```bash\n   rc-update add docker default\n   ```\n   Then start the Docker service: \n   ```bash\n   /etc/init.d/docker start\n   ```\n\n4. **Verify the installation**: You can confirm that Docker is running by executing: \n   ```bash\n   docker ps\n   ```\n   This should 

In [18]:
question = "how do I join the course?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
answer = llm(prompt)
print(answer)

{
"action": "SEARCH",
"reasoning": "The context is empty, so I cannot find the answer within it. I will need to refer to the FAQ database to find information about how to join the course."
}


In [19]:
def build_context(search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context.strip()

In [20]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course start

In [21]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you need to register before it starts. You can find the registration link in the course information, and be sure to also join the course public Google Calendar for updates. Additionally, make sure to join the course Telegram channel and register on DataTalks.Club's Slack to stay informed about announcements and course-related communication.",
"source": "CONTEXT"
}


In [22]:
def agentic_rag_v1(question):
    context = "EMPTY"
    prompt = prompt_template.format(question=question, context=context)
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(answer)

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        search_results = search(question)
        context = build_context(search_results)
        
        prompt = prompt_template.format(question=question, context=context)
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(answer)

    return answer

In [23]:
agentic_rag_v1('how do I join the course?')

{'action': 'ANSWER', 'answer': "To join the course, you typically need to visit the course's official website or the platform where it is hosted, register for an account if you don't have one, and then follow the registration process for the specific course you are interested in. This may involve paying a fee or submitting an application, depending on the course requirements.", 'source': 'OWN_KNOWLEDGE'}


{'action': 'ANSWER',
 'answer': "To join the course, you typically need to visit the course's official website or the platform where it is hosted, register for an account if you don't have one, and then follow the registration process for the specific course you are interested in. This may involve paying a fee or submitting an application, depending on the course requirements.",
 'source': 'OWN_KNOWLEDGE'}

In [24]:
agentic_rag_v1('how patch KDE under FreeBSD?')

{'action': 'ANSWER', 'answer': "To patch KDE under FreeBSD, you typically follow these steps: \n\n1. **Install the necessary tools**: Make sure you have 'git' and 'patch' installed in your FreeBSD system. You can install them using the package manager:\n   ```bash\n   pkg install git patch\n   ```\n\n2. **Get the source code**: If you haven't already, you need to obtain the KDE source code. You can find it in the FreeBSD ports collection, or you can clone it directly from the KDE repositories:\n   ```bash\n   git clone https://invent.kde.org/packaging/kde-freebsd.git\n   ```\n\n3. **Download the patch**: Obtain the patch file that you would like to apply to the source code. This could be from the KDE project website or a bug tracker.\n\n4. **Apply the patch**: Navigate to the directory where you have the KDE sources and apply the patch using the following command:\n   ```bash\n   patch -p1 < /path/to/your/patch/file.patch\n   ```\n   Ensure that you are using the correct path to your p

{'action': 'ANSWER',
 'answer': "To patch KDE under FreeBSD, you typically follow these steps: \n\n1. **Install the necessary tools**: Make sure you have 'git' and 'patch' installed in your FreeBSD system. You can install them using the package manager:\n   ```bash\n   pkg install git patch\n   ```\n\n2. **Get the source code**: If you haven't already, you need to obtain the KDE source code. You can find it in the FreeBSD ports collection, or you can clone it directly from the KDE repositories:\n   ```bash\n   git clone https://invent.kde.org/packaging/kde-freebsd.git\n   ```\n\n3. **Download the patch**: Obtain the patch file that you would like to apply to the source code. This could be from the KDE project website or a bug tracker.\n\n4. **Apply the patch**: Navigate to the directory where you have the KDE sources and apply the patch using the following command:\n   ```bash\n   patch -p1 < /path/to/your/patch/file.patch\n   ```\n   Ensure that you are using the correct path to your 

## Agentic search

In [25]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.


Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}


<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [37]:
question = "how do I join the course?"

search_queries = []
search_results = []
previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [38]:
answer_json = llm(prompt)
answer = json.loads(answer_json)

In [39]:
previous_actions.append(answer)

In [40]:
previous_actions

[{'action': 'SEARCH',
  'reasoning': "The student is asking about how to join the course, which likely involves enrollment procedures or requirements that haven't been provided in the current context. More information from the FAQ database could clarify the process.",
  'keywords': ['how to join the course',
   'enrollment process',
   'course registration instructions']}]

In [41]:
print(json.dumps(answer, indent=2))

{
  "action": "SEARCH",
  "reasoning": "The student is asking about how to join the course, which likely involves enrollment procedures or requirements that haven't been provided in the current context. More information from the FAQ database could clarify the process.",
  "keywords": [
    "how to join the course",
    "enrollment process",
    "course registration instructions"
  ]
}


In [42]:
keywords = answer['keywords']
search_queries.extend(keywords)

In [43]:
for k in keywords:
    res = search(k)
    search_results.extend(res)

In [44]:
len(search_results)

15

In [45]:
def dedup(seq):
    seen = set()
    result = []
    for el in seq:
        _id = el['_id']
        if _id in seen:
            continue
        seen.add(_id)
        result.append(el)
    return result

In [46]:
search_results = dedup(search_results)

In [47]:
len(search_results)

11

In [48]:
# question = "how do I join the course?"

# search_queries = []
# search_results = []
# previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=2
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [49]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "ANSWER_CONTEXT",
  "answer": "To join the course, you need to register before the course starts using the provided link. The course will begin on January 15, 2024, at 17:00. You also have the option to join the course's Telegram channel for announcements and are encouraged to register in DataTalks.Club's Slack and join the relevant channel. Even if you don't register, you can still submit homework, but there are deadlines for the final projects, so it's best to manage your time wisely.",
  "source": "CONTEXT"
}


In [50]:
question = "what do I need to do to be successful at module 1?"

search_queries = []
search_results = []
previous_actions = []


iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=3,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    search_queries = list(set(search_queries) | set(keywords))
    
    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)
    
    iteration = iteration + 1
    if iteration >= 4:
        break

    print()


ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

In [51]:
def agentic_search(question):
    search_queries = []
    search_results = []
    previous_actions = []

    iteration = 0
    
    while True:
        print(f'ITERATION #{iteration}...')
    
        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=3,
            iteration_number=iteration
        )
    
        print(prompt)
    
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)
    
        action = answer['action']
        if action != 'SEARCH':
            break
    
        keywords = answer['keywords']
        search_queries = list(set(search_queries) | set(keywords))

        for k in keywords:
            res = search(k)
            search_results.extend(res)
    
        search_results = dedup(search_results)
        
        iteration = iteration + 1
        if iteration >= 4:
            break
    
        print()

    return answer

In [52]:
agentic_search('how do I prepare for the course?')

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.


Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

{'action': 'ANSWER_CONTEXT',
 'answer': 'Before the course starts, you can prepare by installing and setting up all the necessary dependencies and requirements, which include a Google Cloud account, Google Cloud SDK, Python 3 (with Anaconda), Terraform, and Git. Additionally, review the prerequisites and the syllabus to ensure you are comfortable with the subjects that will be covered in the course.',
 'source': 'CONTEXT'}

In [53]:
print(globals()['answer'])

{'action': 'ANSWER', 'answer': "To be successful in Module 1, which focuses on Docker and Terraform, it is essential to understand the core concepts of these technologies, work on practical exercises, and be familiar with troubleshooting common issues. Here are some key steps you can take:\n\n1. **Study the Learning Material**: Make sure to go through the provided learning resources and documentation for Docker and Terraform thoroughly.\n\n2. **Hands-on Practice**: Install Docker and Terraform on your machine, and try out practical exercises to solidify your understanding.\n\n3. **Resolve Installation Issues**: Be prepared to troubleshoot installation or module errors. For example, common errors like 'ModuleNotFoundError' for certain Python packages should prompt you to install those packages correctly using pip or conda.\n\n4. **Check Network Settings**: If you encounter network-related issues (like failing to access provider packages), ensure that your VPN settings or firewall are no

## Tools (function calling)

https://platform.openai.com/docs/guides/function-calling

    def search(query):
        boost = {'question': 3.0, 'section': 0.5}
    
        results = index.search(
            query=query,
            filter_dict={'course': 'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5,
            output_ids=True
        )
    
        return results


In [54]:
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }

}

In [55]:
tools = [search_tool]

In [56]:
question = "How do I do well in module 1?"

In [57]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [58]:
response

Response(id='resp_6878420eab6c81a38fa77d4a7efa902601a79e85977e5feb', created_at=1752711694.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_OhxNXDKj9HBqqrlGpoWapXuk', name='search', type='function_call', id='fc_6878420f5c8081a38daeafd26e17c4ce01a79e85977e5feb', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=Reasoning(effort=None, generate_summary=None, summary=None), servic

In [59]:
response.output

[ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_OhxNXDKj9HBqqrlGpoWapXuk', name='search', type='function_call', id='fc_6878420f5c8081a38daeafd26e17c4ce01a79e85977e5feb', status='completed')]

In [60]:
# response.choices[0].message.content
calls = response.output

In [61]:
call = calls[0]
call

ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_OhxNXDKj9HBqqrlGpoWapXuk', name='search', type='function_call', id='fc_6878420f5c8081a38daeafd26e17c4ce01a79e85977e5feb', status='completed')

In [62]:
call.call_id

'call_OhxNXDKj9HBqqrlGpoWapXuk'

In [63]:
f_name = call.name
f_name

'search'

In [64]:
arguments = json.loads(call.arguments)
arguments

{'query': 'module 1 tips'}

In [65]:
f = locals()[f_name]

In [66]:
results = f(**arguments)

In [68]:
search_results = json.dumps(results, indent=2)
print(search_results)

[
  {
    "text": "Following dbt with BigQuery on Docker readme.md, after `docker-compose build` and `docker-compose run dbt-bq-dtc init`, encountered error `ModuleNotFoundError: No module named 'pytz'`\nSolution:\nAdd `RUN python -m pip install --no-cache pytz` in the Dockerfile under `FROM --platform=$build_for python:3.9.9-slim-bullseye as base`",
    "section": "Module 4: analytics engineering with dbt",
    "question": "DBT - Error: No module named 'pytz' while setting up dbt with docker",
    "course": "data-engineering-zoomcamp",
    "_id": 299
  },
  {
    "text": "Issue:\ne\u2026\nSolution:\npip install psycopg2-binary\nIf you already have it, you might need to update it:\npip install psycopg2-binary --upgrade\nOther methods, if the above fails:\nif you are getting the \u201c ModuleNotFoundError: No module named 'psycopg2' \u201c error even after the above installation, then try updating conda using the command conda update -n base -c defaults conda. Or if you are using pip, t

In [67]:
chat_messages.append(call)

In [69]:
chat_messages.append({
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": search_results,
})

In [70]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [71]:
r = response.output[0]

In [72]:
print(r.content[0].text)

To do well in Module 1, here are some general tips:

1. **Understand Key Concepts**: Familiarize yourself with the core concepts covered in the module. Focus on the basics of Docker and Terraform, as these are essential for the course.

2. **Hands-On Practice**: Set up Docker and try running sample applications. The more you practice, the more comfortable you will become with the tools.

3. **Troubleshooting**: Pay attention to common errors (like the `ModuleNotFoundError` for `psycopg2`). Make sure you know how to resolve these by installing the necessary packages.

4. **Ask Questions**: If you're stuck, don’t hesitate to ask questions in the forums or during live sessions. Engaging with fellow students and instructors can provide clarity.

5. **Use Resources Wisely**: Utilize any recommended readings, videos, or documentation provided in the course. They can offer deeper insights into the topics discussed.

6. **Stay Organized**: Keep your work organized. Document your learning and i

In [73]:
r.type

'message'

In [74]:
call.type

'function_call'

### Multiple calls

In [86]:
def do_call(tool_call_response):
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }

In [76]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [77]:
response

Response(id='resp_68784232dca0819c9ea3abc4db0b546d005bae8ac02d6d62', created_at=1752711730.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_Wdb0aEYe97vxcqMwL5ScqcYd', name='search', type='function_call', id='fc_687842349574819c974eb8e1a0302d68005bae8ac02d6d62', status='completed'), ResponseFunctionToolCall(arguments='{"query":"module 1 tips for success"}', call_id='call_Z45OmEaFHdlPbGitqZrhwJMV', name='search', type='function_call', id='fc_68784234fe34819c9d2649972ffcb452005bae8ac02d6d62', status='completed'), ResponseFunctionToolCall(arguments='{"query":"study strategies for module 1"}', call_id='call_mclv98vemGIG5AhsoGhO4toQ', name='search', type='function_call', id='fc_687842355c0c819ca967ca4f37ffb2e8005bae8ac02d6d62', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[Fun

In [78]:
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.text) 

function_call
function_call
function_call


In [79]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)
    print()

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.content[0].text) 

message

To do well in Module 1, here are some tips and strategies:

1. **Understanding Prerequisites**:
   - Make sure you have all the necessary tools and libraries installed, such as Docker and PostgreSQL.
   - It's essential to install the Python module `psycopg2`. You can do this using:
     ```bash
     pip install psycopg2-binary
     ```

2. **Handling Common Errors**:
   - If you encounter the error **ModuleNotFoundError: No module named 'psycopg2'**, ensure that you have installed the module correctly or update it:
     ```bash
     pip install psycopg2-binary --upgrade
     ```

3. **Follow the Course Content Diligently**:
   - Engage with the provided tutorials and resources, and follow the instructions carefully, especially regarding Docker configurations.

4. **Practice with Docker**:
   - Familiarize yourself with Docker commands like `docker-compose build` and `docker-compose run`. Running these commands correctly is vital for setting up your environment.

5. **Experime

### Putting it all together

Have two loops:

- First is the main Q&A loop - ask question, get back the answer
- Second is the request loop - send requests until there's a message reply from API

In [80]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [81]:
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False
        
        for entry in response.output:
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                result = do_call(entry)
                chat_messages.append(result)
            elif entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True

        if has_messages:
            break

 How do I do well in module 4?


function_call: ResponseFunctionToolCall(arguments='{"query":"module 4"}', call_id='call_6sxBYT0oBlPTWPmSX4K3cD6M', name='search', type='function_call', id='fc_6878425a46e8819fae7f78602c1c26b50baeeafc8a3abd7c', status='completed')

To do well in **Module 4: Analytics Engineering with dbt**, here are some key tips:

1. **Understand dbt Fundamentals**: Make sure you grasp the core concepts of dbt, including models, sources, and tests.

2. **Hands-on Practice**: Engage in practical exercises by working through examples provided in the course materials. This will help solidify your understanding.

3. **Utilize Docker**: If you're setting up dbt with Docker, ensure your environment is correctly configured. If you run into the error `ModuleNotFoundError: No module named 'pytz'`, you can resolve this by adding `RUN python -m pip install --no-cache pytz` in your Dockerfile.

4. **Collaborate and Discuss**: Join your peers in study groups or forums to discuss assignments and clarify doubts. This

 where can I get more hands on experience with dbt?


function_call: ResponseFunctionToolCall(arguments='{"query":"hands-on experience with dbt"}', call_id='call_wjJzOyPgaQhFd9PVRYhkzgzE', name='search', type='function_call', id='fc_687842777b20819fb06011a47646b95c0baeeafc8a3abd7c', status='completed')

To gain more hands-on experience with dbt, consider the following options:

1. **dbt Cloud**: Leverage dbt Cloud where you can create and manage your dbt projects. This platform provides built-in CI/CD workflows, making it easier to test and deploy models.

2. **Local Environment Setup**: Run dbt locally. Follow the documentation to set up dbt in your local environment. Use Docker if you prefer containerization.

3. **Project Work**: Engage in personal projects or contribute to open-source dbt projects. This not only helps in solidifying your knowledge, but also allows you to tackle real-world problems.

4. **Online Courses and Tutorials**: Look for online courses that feature practical assignments related to dbt. Websites like GitHub may 

 stop


Same using widgets

In [82]:
from IPython.display import display, HTML
import markdown # pip install markdown

In [88]:
def display_function_call(entry, result):
        call_html = f"""
            <details>
            <summary>Function call: <tt>{entry.name}({shorten(entry.arguments)})</tt></summary>
            <div>
                <b>Call</b>
                <pre>{entry}</pre>
            </div>
            <div>
                <b>Output</b>
                <pre>{result['output']}</pre>
            </div>
            
            </details>
        """
        display(HTML(call_html))

In [90]:
def shorten(text, max_length=50):
    if len(text) <= max_length:
        return text

    return text[:max_length - 3] + "..."

In [92]:
def display_response(entry):
        response_html = markdown.markdown(entry.content[0].text)
        html = f"""
            <div>
                <div><b>Assistant:</b></div>
                <div>{response_html}</div>
            </div>
        """
        display(HTML(html))

In [93]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

# Chat loop
while True:
    question = input()
    if question.strip().lower() == 'stop':
        print("Chat ended.")
        break
    print()

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True:  # inner request loop
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False

        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == "function_call":
                result = do_call(entry)
                chat_messages.append(result)
                display_function_call(entry, result)

            elif entry.type == "message":
                display_response(entry)
                has_messages = True

        if has_messages:
            break

 How do I do well in module 4?





 what's the criteria to obtain course completion certificate?





 stop


Chat ended.


## Adding more tools

In [94]:
def add_entry(question, answer):
    doc = {
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp'
    }
    index.append(doc)

In [95]:
add_entry_description = {
    "type": "function",
    "name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        "required": ["question", "answer"],
        "additionalProperties": False
    }
}

In [96]:
import chat_assistant

In [100]:
tools.add_tool(add_entry, add_entry_description)
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [101]:
tools.get_tools()


[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [102]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()

chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [103]:
chat.run()

You: How do I score the highest points to top the leaderbord?


You: How do I fail the module?


You: how to install docker in gentoo?


You: stop


Chat ended.


In [104]:
tools.add_tool(add_entry, add_entry_description)
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [107]:
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [105]:
chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [108]:
chat.run()

You: how can i install kubernetes on ubuntu?


You: Add this info to the index


You: stop


Chat ended.


In [109]:
index.docs[-1]

{'question': 'How can I install Kubernetes on Ubuntu?',
 'text': 'To install Kubernetes on Ubuntu, follow these steps:\n\n1. **Set Up the Environment**: Ensure Docker is installed:\n   ```bash\n   sudo snap install docker\n   ```\n\n2. **Add Kubernetes APT Repository**: Import the Google Cloud public signing key:\n   ```bash\n   curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -\n   ```\n   Add the Kubernetes APT repository:\n   ```bash\n   sudo sh -c \'echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" > /etc/apt/sources.list.d/kubernetes.list\'\n   ```\n\n3. **Install Kubernetes**: Update package list and install:\n   ```bash\n   sudo apt-get update\n   sudo apt-get install -y kubelet kubeadm kubectl\n   ```\n\n4. **Hold the Version**: Prevent automatic updates:\n   ```bash\n   sudo apt-mark hold kubelet kubeadm kubectl\n   ```\n\n5. **Initialize Kubernetes Cluster**: Initialize the cluster:\n   ```bash\n   sudo kubeadm init\n   ```\n   Follow

In [110]:
pip install pydantic-ai

Note: you may need to restart the kernel to use updated packages.


In [111]:
from pydantic_ai import Agent, RunContext

In [112]:
chat_agent = Agent(  
    'openai:gpt-4o-mini',
    system_prompt=developer_prompt
)

In [113]:
from typing import Dict

In [114]:
@chat_agent.tool
def search_tool(ctx: RunContext, query: str) -> Dict[str, str]:
    """
    Search the FAQ for relevant entries matching the query.

    Parameters
    ----------
    query : str
        The search query string provided by the user.

    Returns
    -------
    list
        A list of search results (up to 5), each containing relevance information 
        and associated output IDs.
    """
    print(f"search('{query}')")
    return search(query)

In [115]:
@chat_agent.tool
def add_entry_tool(ctx: RunContext, question: str, answer: str) -> None:
    """
    Add a new question-answer entry to FAQ.

    This function creates a document with the given question and answer, 
    tagging it as user-added content.

    Parameters
    ----------
    question : str
        The question text to be added to the index.

    answer : str
        The answer or explanation corresponding to the question.

    Returns
    -------
    None
    """
    return add_entry(question, answer)

In [116]:
user_prompt = "I just discovered the course. Can I join now?"
agent_run = await chat_agent.run(user_prompt)
print(agent_run.output)

search('Can I join the course now?')
Yes, you can still join the course even if it has already started. You will be eligible to submit homework assignments, but keep in mind that there are deadlines for turning in final projects, so it's best not to leave anything to the last minute.

Would you like more details about the specific deadlines or requirements for the course?
