In [1]:
import requests 

# Download the FAQ dump: a JSON list of courses, each carrying its own
# list of documents under the 'documents' key.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps a stalled connection from hanging the kernel forever.
docs_response = requests.get(docs_url, timeout=30)
# Fail here, with a clear HTTP error, rather than later while trying to
# JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of documents, tagging each
# document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Note: this annotates the dicts inside documents_raw in place.
        doc['course'] = course_name
        documents.append(doc)

In [2]:
# Peek at one flattened record to check its fields, including the added 'course'.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import minsearch

# Build the search index over the flattened FAQ documents.
# "question", "text" and "section" are registered as text fields and
# "course" as a keyword field (search() below filters on "course").
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Kept as the last expression of the cell, so its return value is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x7f2331ae9750>

In [4]:
from openai import OpenAI

# OpenAI client pointed at the DeepSeek endpoint.
# NOTE(review): no api_key is passed, so the client presumably reads it from
# the environment (OPENAI_API_KEY) — confirm it holds a DeepSeek key.
client = OpenAI(base_url="https://api.deepseek.com")

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries relevant to ``query`` in the notebook-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must match. Defaults to the
            course that was previously hard-coded.
        num_results: Maximum number of results requested from the index.
            Defaults to the previously hard-coded 5.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` iterates over it
        and reads the ``section``, ``question`` and ``text`` keys.
    """
    # Per-field boosts passed to the index: 'question' gets 3.0 and
    # 'section' gets 0.5; 'text' is not listed.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [6]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes a block under ``CONTEXT:``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # Same text as a stripped triple-quoted template, spelled out line by
    # line so the trailing space after "CONTEXT:" is explicit.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [7]:
def llm(prompt, model='deepseek-chat'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message with role "user".
        model: Model name passed to the chat completions endpoint. Defaults
            to the previously hard-coded 'deepseek-chat'.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [8]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the prompt built from the search hits.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [9]:
# Try the full pipeline (search -> prompt -> LLM) on a sample question.
rag('how do I run kafka?')

'To run Kafka, follow the instructions based on your use case:\n\n1. **For Java Kafka (e.g., running a producer/consumer/KStreams in terminal):**  \n   Navigate to the project directory and execute:  \n   ```bash\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka (e.g., running `producer.py`):**  \n   - Ensure Docker containers are running.  \n   - Set up a virtual environment (run once):  \n     ```bash\n     python -m venv env\n     source env/bin/activate  # On Windows: env\\Scripts\\activate\n     pip install -r ../requirements.txt\n     ```  \n   - Activate the virtual environment before running Python files:  \n     ```bash\n     source env/bin/activate\n     ```  \n   - If encountering `ModuleNotFoundError` for `kafka`, use:  \n     ```bash\n     pip install kafka-python-ng\n     ```  \n\n3. **For permission issues (e.g., `./build.sh: Permission denied`):**  \n   Run:  \n   ```bash\n   chmod +x bui

In [10]:
# Second sample question, phrased differently from the FAQ entry shown by documents[2].
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. You are eligible to submit the homeworks, but be mindful of the deadlines for the final projects. The course materials will also remain available after the course finishes, allowing you to follow the course at your own pace. \n\nAdditionally, you can join the course's Slack channel for support and ask questions there, though it's recommended to search the channel and FAQ first for existing answers."