In [1]:
import os

from sympy.polys.polyconfig import query
!pip install minsearch

Collecting minsearch
  Downloading minsearch-0.0.3-py3-none-any.whl.metadata (6.1 kB)
Downloading minsearch-0.0.3-py3-none-any.whl (9.3 kB)
Installing collected packages: minsearch
Successfully installed minsearch-0.0.3


In [2]:
import minsearch
import json

In [4]:
# Load the raw FAQ dump: a list of per-course dicts, each holding a 'documents' list.
# UTF-8 is requested explicitly because the FAQ text contains non-ASCII punctuation
# (curly quotes, apostrophes) that must not depend on the platform's default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the course it belongs to. New dicts are built instead of writing
# into docs_raw, so the raw data stays untouched and the cell is safe to re-run.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [6]:
# Peek at one flattened record to confirm it now carries the 'course' key.
documents[1]

{'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Readme on GitHub',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# Free-text search runs over the three text fields; 'course' is declared as a
# keyword field and is used for filtering via filter_dict in the search calls.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

Keyword fields act like an exact-match filter in SQL, for example: SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';

In [8]:
# Sample user question reused by the search and LLM cells that follow.
q = 'the course has already started, can I still enroll?'
# Build the index over the flattened records; fit() hands back the index object,
# which is why this cell echoes an Index repr.
index.fit(documents)

<minsearch.minsearch.Index at 0x75a2540d5dc0>

In [9]:
# Per-field boosts handed to the search call: question 3.0, section 0.5, text 1.0.
boost = dict(question=3.0, section=0.5, text=1.0)

# Top five hits for the sample question, restricted to the data engineering course.
results = index.search(
    query=q,
    filter_dict={"course": "data-engineering-zoomcamp"},
    boost_dict=boost,
    num_results=5,
)

In [10]:
# Display the retrieved FAQ entries for the sample question.
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homework.\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (

In [4]:
from openai import OpenAI

In [3]:
# Create the OpenAI client used by the chat-completion cells below.
# NOTE(review): no key is passed, so it is presumably read from the environment
# (OPENAI_API_KEY) — confirm.
# NOTE(review): the recorded execution ran this cell (In [3]) before the import
# cell (In [4]), which is what produced the NameError shown below; run the import first.
client = OpenAI()

NameError: name 'OpenAI' is not defined

In [None]:
# Only report whether the API key is configured. Echoing all of os.environ would
# write every environment variable (secrets included) into the saved notebook output.
has_openai_key = "OPENAI_API_KEY" in os.environ
has_openai_key

In [21]:
# Baseline without retrieval: send the raw question straight to the model.
response = client.chat.completions.create(
    messages=[dict(role="user", content=q)],
    model='gpt-4.1',
)

# Text of the first (and only requested) completion choice.
response.choices[0].message.content

'It depends on the course and the institution offering it. Many courses allow late enrollment within a certain time frame, especially online courses or ones with self-paced modules. However, some courses may have strict deadlines and may not accept new students after the start date.\n\n**What to do:**\n- **Check the course website or platform** for enrollment policies.\n- **Contact the course administrator or instructor** to ask if late enrollment is possible.\n- **Be prepared** to catch up on missed material if you are allowed to enroll late.\n\nIf you share more details—such as the course name, provider, or platform—I can offer more specific advice!'

In [20]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the notebook-level minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            the data engineering zoomcamp, which used to be hard-coded.
        num_results: Maximum number of entries to request (default 5).

    Returns:
        The result of ``index.search`` for the given arguments.
    """
    # Explicit boosts: 3.0 for 'question', 0.5 for 'section'; 'text' gets none here.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [22]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query``.

    Args:
        query: The user's question, substituted into the QUESTION slot.
        search_results: Iterable of dicts with 'section', 'question' and 'text'
            keys; each one becomes a block in the CONTEXT slot.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    # One "section / question / answer" block per hit, each followed by a blank line.
    rendered_hits = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # The final strip() drops the blank line left after the last context block.
    return prompt_template.format(question=query, context=rendered_hits).strip()

In [24]:
def llm(prompt, model='gpt-4.1'):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text to send.
        model: Chat model name passed to the API. Defaults to 'gpt-4.1', the
            value that used to be hard-coded.

    Returns:
        The message content of the first completion choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [27]:
# Example question for the minsearch-backed pipeline defined in this cell.
query = 'how do I run kafka?'
# query = q  # alternative: reuse the enrollment question defined earlier
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Fetches FAQ entries through ``search``, renders them into a prompt with
    ``build_prompt`` and returns whatever ``llm`` produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [29]:
# Run the pipeline end to end on the example question and print the answer text.
print(rag(query))

To run Kafka in the context of your course, follow these steps depending on whether you’re using Python or Java:

**For Python:**
1. **Set up a virtual environment:**
   - Create a virtual environment (do this only once):  
     `python -m venv env`
   - Activate the environment:  
     - On MacOS/Linux: `source env/bin/activate`
     - On Windows: `env\Scripts\activate`
   - Install dependencies:  
     `pip install -r ../requirements.txt`
   - If you see `ModuleNotFoundError: No module named 'kafka'`, make sure to use  
     `pip install kafka-python-ng` (as per the recommended library).

2. **Run your producer or consumer Python files while the virtual environment is active.**
   - Also, make sure all necessary Docker images (Kafka, Zookeeper, etc.) are up and running before running your Python files.

**For Java:**
- In your project directory, use the following command to run Kafka producer/consumer/KStreams, etc.:
  ```
  java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/mai

In [35]:
from elasticsearch import Elasticsearch

In [36]:
# Client for the Elasticsearch node expected to be listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [37]:
# Sanity check: confirm the node answers and show its cluster/version details.
es_client.info()

ObjectApiResponse({'name': '80f9e0461d82', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'K--Di7DDRcO5HZx_ZbicIw', 'version': {'number': '9.0.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '0a58bc1dc7a4ae5412db66624aab968370bd44ce', 'build_date': '2025-05-28T10:06:37.834829258Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [38]:
# Single-shard, zero-replica index. The three FAQ fields are analysed as full
# text; 'course' is a keyword so it can be used in exact term filters.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists is rejected, and starting from an empty
# index keeps the indexing loop from piling up duplicate documents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [39]:
# Look at the first record to see the shape of what gets indexed.
documents[0]

{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [40]:
from tqdm.auto import tqdm

In [41]:
# Index every FAQ record, one request per document, with a tqdm progress bar.
# NOTE(review): no document id is passed, so re-running this cell presumably
# stores a second copy of every record — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1122 [00:00<?, ?it/s]

In [42]:
# Example question for the Elasticsearch-backed pipeline; replaces the earlier `query` value.
query = 'I just discovered the course. Can I still join it?'

In [43]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for ``query`` in the Elasticsearch index ``index_name``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is term-filtered on. Defaults to the
            data engineering zoomcamp, which used to be hard-coded.
        num_results: Number of hits to request (the ``size`` of the search,
            default 5).

    Returns:
        A list with the ``_source`` document of each hit, in the order returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Scored part: match the question text across the three text
                # fields, with 'question' boosted by 3.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Filter part: only documents of the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]


In [44]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation backed by Elasticsearch.

    Fetches FAQ entries through ``elastic_search``, renders them into a prompt
    with ``build_prompt`` and returns whatever ``llm`` produces for that prompt.

    Note: this definition replaces the earlier ``rag`` in this notebook, which
    retrieved through ``search`` instead.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [45]:
# Run the Elasticsearch-backed pipeline on the example question and print the answer text.
print(rag(query))

Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homework without registering, but please make sure to meet the deadlines for homework and final project submissions. Don’t leave everything for the last minute!
