In [1]:
import requests
import json

from minsearch import AppendableIndex

from utilities import chat_assistant


In [2]:
# Sample student questions reused by the experiments further down.
question_1 = "Can I still join the course?"  # drives the basic RAG and first agentic-RAG demos
question_2 = "How do I patch KDE under FreeBSD?"  # used to compare rag() against a bare llm() call
question_3 = "How can I run Docker on Windows 10?"
question_4 = 'how do I do well on module 1?'  # drives the agentic-search and function-calling demos

In [3]:
# Download the FAQ dump and flatten it into a single list of documents,
# tagging every entry with the course it belongs to.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Bound the wait so a stalled connection cannot hang the notebook forever.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast on an HTTP error instead of a confusing JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # search() filters on this field, so every entry must carry it.
        doc['course'] = course_name
        documents.append(doc)

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# Build the search index over the flattened FAQ documents.
# AppendableIndex is already imported in the first cell, so it is not
# re-imported here.
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]  # search() passes a filter on this field
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x760aa0796a20>

In [6]:
def search(query):
    """Query the FAQ index for entries matching ``query``.

    The lookup is restricted to the 'data-engineering-zoomcamp' course,
    weights the 'question' field at 3.0 and the 'section' field at 0.5,
    and asks the index for at most 5 results with ids included.

    Returns whatever ``index.search`` returns for those settings.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )

In [7]:


# Prompt for plain RAG: the model must answer strictly from the supplied context.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()


def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved FAQ entries.

    Each entry in ``search_results`` must provide the keys 'section',
    'question' and 'text'. Every entry is rendered as a three-line block
    followed by a blank line, and the blocks are placed in the CONTEXT slot
    of ``prompt_template``.

    Returns the formatted prompt with surrounding whitespace stripped.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    rendered = prompt_template.format(question=query, context="".join(entries))
    return rendered.strip()



In [8]:
search_results = search(question_1)

prompt = build_prompt(question_1, search_results)

print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
Can I still join the course?
</QUESTION>

<CONTEXT>
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.

section: General

In [9]:
from openai import OpenAI
client = OpenAI()

def llm(prompt):
    """Send ``prompt`` as a single user message to 'gpt-4o-mini'.

    Returns the text content of the first choice in the chat completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
answer = llm(prompt)

print(answer)

Yes, you can still join the course after the start date. Even if you don't register, you're eligible to submit the homeworks, but keep in mind that there will be deadlines for turning in the final projects, so it’s best not to leave everything until the last minute.


In [11]:
def rag(query):
    """Answer ``query`` with the retrieve -> build prompt -> generate pipeline.

    Runs ``search`` for the query, renders the results with ``build_prompt``
    and returns the reply produced by ``llm``.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [12]:
rag(question_2)

"I'm sorry, but there is no information provided in the CONTEXT regarding how to patch KDE under FreeBSD."

In [13]:
print(llm(question_2))

Patching KDE under FreeBSD generally involves updating the software to a newer version, applying a bug fix, or modifying the source code for your specific needs. Here’s a general guide on how to patch KDE under FreeBSD:

### 1. **Install Necessary Tools**
Before you begin, ensure that you have the necessary tools installed. You'll need `ports`, `pkg`, and possibly `git` to get the source code.

```bash
pkg install git
pkg install portsnap
```

### 2. **Update Ports Tree**
Update your ports tree to ensure you have the latest version of KDE and its dependencies.

```bash
portsnap fetch update
```

### 3. **Fetch the KDE Port**
Navigate to the KDE port directory you want to patch. For example, if you want to patch `kde5`, you can find it in:

```bash
cd /usr/ports/x11/kde5
```

### 4. **Obtain the Source Code**
If you need to download the source code for KDE, you can typically do this using the `make` command.

```bash
make fetch
```

### 5. **Apply the Patch**
If you have a specific patc

## Agentic RAG

In [14]:


# Prompt for the first "agentic" step: the model chooses between asking for a
# FAQ search and answering. This rebinds the `prompt_template` name used by
# the earlier RAG cells.
# Placeholders filled by str.format: {question} and {context}.
# The doubled braces {{ }} are str.format escapes, so the rendered prompt
# shows literal JSON objects as output templates.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()



In [15]:
question = question_1
context = 'EMPTY'

prompt = prompt_template.format(question=question, context=context)
answer_json = llm(prompt)

answer = json.loads(answer_json)

In [16]:
print(answer)

{'action': 'SEARCH', 'reasoning': 'The question about joining the course could be addressed in our FAQ database, which likely contains information about enrollment deadlines and procedures.'}


In [17]:
def build_context(search_results):
    """Render retrieved FAQ entries as a plain-text context block.

    Each entry must provide the keys 'section', 'question' and 'text'.
    Entries are rendered as three labelled lines and separated by a blank
    line; leading and trailing whitespace of the whole block is stripped.

    Returns an empty string when ``search_results`` is empty.
    """
    blocks = (
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    return "".join(blocks).strip()

In [18]:
search_results = search(question)
context = build_context(search_results)
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You c

In [19]:
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
Can I still join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the cour

In [20]:
answer_json = llm(prompt)

print(answer_json)

{
"action": "ANSWER",
"answer": "Yes, you can still join the course even if it has already started. You will be able to submit assignments and participate in the course activities, but remember to meet the deadlines for any final projects.",
"source": "CONTEXT"
}


## Agentic Search

In [21]:
def dedup(seq):
    """Drop entries whose '_id' was already seen, keeping the first occurrence.

    Every element of ``seq`` must be subscriptable with the key '_id'.
    The relative order of the surviving elements is preserved.

    Returns a new list; ``seq`` itself is not modified.
    """
    first_by_id = {}
    for el in seq:
        # setdefault keeps the element stored first for a given id;
        # dicts iterate in insertion order, so first-seen order is retained.
        first_by_id.setdefault(el['_id'], el)
    return list(first_by_id.values())

In [22]:
# Prompt for the iterative agentic-search loop. The model either requests more
# FAQ searches (SEARCH with a list of keywords) or produces a final answer.
# Placeholders filled by str.format: {question}, {context}, {search_queries},
# {previous_actions}, {max_iterations}, {iteration_number}.
# The doubled braces {{ }} are str.format escapes for the literal JSON templates.
# Wording fixes only: the garbled sentences describing CONTEXT and
# SEARCH_QUERIES were repaired; placeholders and action names are unchanged.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student that you need to answer with your own knowledge and the provided CONTEXT.

The CONTEXT is built from the documents in our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from the FAQ and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic.

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT>
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [23]:
max_iterations = 3
iteration_number = 1
search_queries = []
search_results  = []
previous_actions = []

In [24]:
context = build_context(search_results)

prompt = prompt_template.format(
    question=question_4,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number
)

In [25]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(answer)

{'action': 'SEARCH', 'reasoning': 'To provide a detailed response on how to succeed in Module 1, it would be beneficial to find specific tips or strategies that are offered in the FAQ. This could include study techniques, resources, or common pitfalls to avoid.', 'keywords': ['how to succeed in Module 1', 'tips for Module 1', 'strategies for doing well in Module 1']}


In [26]:
# Manual second iteration: run the searches the model asked for, then
# re-prompt it with the enlarged context.
previous_actions.append(answer)
keywords = answer['keywords']

for kw in keywords:
    search_queries.append(kw)
    sr = search(kw)
    search_results.extend(sr)

# Different keywords can return the same FAQ entry; keep each one once.
search_results = dedup(search_results)

iteration_number = 2

context = build_context(search_results)

queries_block = "\n".join(search_queries)
actions_block = '\n'.join(json.dumps(a) for a in previous_actions)

prompt = prompt_template.format(
    question=question_4,
    context=context,
    search_queries=queries_block,
    previous_actions=actions_block,
    max_iterations=max_iterations,
    iteration_number=iteration_number,
)

answer_json = llm(prompt)
answer = json.loads(answer_json)
print(answer)

{'action': 'SEARCH', 'reasoning': 'To gather more information specifically related to effective strategies and resources for Module 1, which focuses on Docker and Terraform. This will help in providing targeted advice to the student on succeeding in this module.', 'keywords': ['effective study techniques for Module 1', 'succeeding in Docker and Terraform', 'Module 1 study resources']}


In [27]:
# Agentic search loop: keep asking the model what to do until it answers
# or the iteration budget is exhausted.
question = question_4

# Single source of truth for the budget: it is both shown to the model in the
# prompt and used for the hard stop below (previously unrelated literals 3 and 4).
max_iterations = 3

search_queries = []
search_results = []
previous_actions = []

iteration = 0

while True:
    print(f'* ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=max_iterations,
        iteration_number=iteration
    )
    print(f'** PROMPT #{iteration}')
    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(f'** RESPONSE #{iteration}')
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    # A SEARCH reply without keywords is treated as "nothing to search"
    # instead of aborting the loop with a KeyError.
    keywords = answer.get('keywords', [])

    # Append unseen queries in first-seen order. A set union would shuffle the
    # queries, making the rendered prompt differ from run to run.
    for k in keywords:
        if k not in search_queries:
            search_queries.append(k)

    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)

    iteration = iteration + 1
    # Same cut-off as before (iterations 0..max_iterations), now tied to the
    # budget value the model is told about.
    if iteration > max_iterations:
        break

    print()

* ITERATION #0...
** PROMPT #0
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student questio

## Function Calling (or "tool use")

In [28]:
def search(query):
    """Search the course FAQ index; exposed to the model as the ``search`` tool.

    The lookup is limited to the 'data-engineering-zoomcamp' course, boosts
    the 'question' field (3.0) over the 'section' field (0.5), and requests
    up to 5 results with ids included.

    Returns the result of ``index.search`` for those settings.
    """
    search_options = dict(
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )
    return index.search(query=query, **search_options)

In [29]:
# Tool description for `search`, passed to the model via the `tools` argument
# of client.responses.create.
search_tool = {
    "type": "function",
    # do_call() looks this name up in globals(), so it has to match the
    # Python function name.
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            # Mirrors the single `query` parameter of search().
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}

In [30]:
def do_call(tool_call_response, registry=None):
    """Execute a tool call requested by the model and wrap its result.

    Args:
        tool_call_response: object with ``name`` (tool name), ``arguments``
            (JSON-encoded keyword arguments) and ``call_id`` attributes.
        registry: optional mapping of tool name -> callable that restricts
            which functions may be invoked. When omitted, the name is
            resolved in this module's globals, as before.

    Returns:
        A dict of type 'function_call_output' carrying the original
        ``call_id`` and the JSON-serialised result of the tool.

    Raises:
        KeyError: if the requested name is unknown or is not callable.
    """
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)

    # NOTE(security): the name comes from model output. Without a registry any
    # callable in module globals can be invoked; pass an explicit registry to
    # limit dispatch to the tools that were actually advertised.
    namespace = globals() if registry is None else registry
    f = namespace.get(function_name)
    if not callable(f):
        raise KeyError(f"Unknown tool requested by the model: {function_name!r}")

    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }

In [31]:
question = question_4

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)
response.output

[ResponseFunctionToolCall(arguments='{"query":"how to do well on module 1"}', call_id='call_Seeo9KH6EuRIC0JkxBxP1aYj', name='search', type='function_call', id='fc_68be0b856e4c81918242f4d36e13920a0dfa40e5941e73fc', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"tips for success in module 1"}', call_id='call_g8VrebUiBLPd8lGZuQjtRHm7', name='search', type='function_call', id='fc_68be0b859dc48191935c816988a497dd0dfa40e5941e73fc', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"module 1 study strategies"}', call_id='call_7qjDa3zLKGdsRRUWwWjcbuP5', name='search', type='function_call', id='fc_68be0b85d334819184c953518793409a0dfa40e5941e73fc', status='completed')]

In [32]:
# Execute the tool calls from the previous response and send the results back.
calls = response.output

for call in calls:
    # Every output item goes back into the transcript, in order.
    chat_messages.append(call)

    # Only function calls can be executed; other item types (e.g. a plain
    # message) have no name/arguments and would make do_call() fail.
    if call.type == 'function_call':
        result = do_call(call)
        chat_messages.append(result)

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)
response.output

[ResponseOutputMessage(id='msg_68be0b86ec588191ade3ead83ed65c5a0dfa40e5941e73fc', content=[ResponseOutputText(annotations=[], text="To do well on Module 1 of the Data Engineering Zoomcamp, here are some tips and strategies:\n\n1. **Understand the Key Concepts**:\n   - Focus on the core topics covered in the module, which include Docker and Terraform. Make sure you understand how to set up and work with these tools.\n\n2. **Practice Hands-On**:\n   - Set up your environment by following the provided instructions carefully. If you're using Docker, for example, familiarize yourself with commands like `docker-compose build` and `docker run`.\n\n3. **Code Examples**:\n   - Often, you'll find that coding examples help solidify your understanding. Try creating simple projects or examples based on the lecture content.\n\n4. **Troubleshooting**:\n   - Be prepared to encounter some errors. Understanding common issues, such as `ModuleNotFoundError` or issues related to SQLAlchemy, can be very hel

In [33]:
chat_messages

[{'role': 'developer',
  'content': "You're a course teaching assistant. \nYou're given a question from a course student and your task is to answer it.\nIf you look up something in FAQ, convert the student question into multiple queries."},
 {'role': 'user', 'content': 'how do I do well on module 1?'},
 ResponseFunctionToolCall(arguments='{"query":"how to do well on module 1"}', call_id='call_Seeo9KH6EuRIC0JkxBxP1aYj', name='search', type='function_call', id='fc_68be0b856e4c81918242f4d36e13920a0dfa40e5941e73fc', status='completed'),
 {'type': 'function_call_output',
  'call_id': 'call_Seeo9KH6EuRIC0JkxBxP1aYj',
  'output': '[\n  {\n    "text": "Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\\nThe solution which worked for me(use following in jupyter notebook) :\\n!pip install findspark\\nimport findspark\\nfindspark.init()\\nThereafter , import pyspark and create spark contex<<t as usual\\n

In [34]:
# Walk the response items: record each one, print model messages, and run tool calls.
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == 'message':
        print(entry.content[0].text)
    elif entry.type == 'function_call':
        # The tool result is appended right after the call it answers.
        result = do_call(entry)
        chat_messages.append(result)

message
To do well on Module 1 of the Data Engineering Zoomcamp, here are some tips and strategies:

1. **Understand the Key Concepts**:
   - Focus on the core topics covered in the module, which include Docker and Terraform. Make sure you understand how to set up and work with these tools.

2. **Practice Hands-On**:
   - Set up your environment by following the provided instructions carefully. If you're using Docker, for example, familiarize yourself with commands like `docker-compose build` and `docker run`.

3. **Code Examples**:
   - Often, you'll find that coding examples help solidify your understanding. Try creating simple projects or examples based on the lecture content.

4. **Troubleshooting**:
   - Be prepared to encounter some errors. Understanding common issues, such as `ModuleNotFoundError` or issues related to SQLAlchemy, can be very helpful. For instance:
     - If you run into `ModuleNotFoundError: No module named 'psycopg2'`, you can resolve it by installing the packa

In [35]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [36]:
# Main Q&A loop: read a question, then keep calling the API until the model
# produces at least one message. Typing 'stop' ends the session.
while True:
    question = input()  # e.g. "How do I do my best for module 1?"
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    # Request-response loop: tool calls are executed and fed back until a
    # response contains a message for the user.
    has_messages = False
    while not has_messages:
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True
            elif entry.type == 'function_call':
                print('function_call:', entry)
                print()
                result = do_call(entry)
                chat_messages.append(result)

function_call: ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_UAx5jiy5DjhDLZee8fabLvHg', name='search', type='function_call', id='fc_68be0b9bb71881a18da6746bcf67b2a90f45564af31a0632', status='completed')

function_call: ResponseFunctionToolCall(arguments='{"query":"tips for success in module 1"}', call_id='call_KZZDvQ2gs3qIRL0K5adtdOrx', name='search', type='function_call', id='fc_68be0b9c845c81a194b005392fd989920f45564af31a0632', status='completed')

To excel in Module 1 of your course, here are some tips that may help based on various experiences shared by students:

1. **Understand Prerequisites**: Make sure you have a solid understanding of the necessary foundational concepts, like Docker and SQLAlchemy, as they're crucial for your success in this module.

2. **Environment Setup**: 
   - Ensure you correctly set up your environment, including all necessary installations. For example, if using SQLAlchemy with PostgreSQL, ensure you have th

## Multiple Tools

In [37]:
def add_entry(question, answer):
    """Append a new question/answer pair to the FAQ index.

    The entry is stored with section 'user added' under the
    'data-engineering-zoomcamp' course, with ``answer`` in the 'text' field.
    Returns None.
    """
    index.append({
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp',
    })

In [38]:
# Tool description for `add_entry`; registered together with the function
# through tools.add_tool(add_entry, add_entry_description).
add_entry_description = {
    "type": "function",
    "name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        # Property names mirror the parameters of add_entry(question, answer).
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        "required": ["question", "answer"],
        "additionalProperties": False
    }
}

In [39]:
tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)
tools.add_tool(add_entry, add_entry_description)
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [40]:



developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()

chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)



In [41]:
chat.run()

Chat ended.
