In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

--2025-06-09 11:40:53--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4014 (3.9K) [text/plain]
Saving to: ‘minsearch.py’


2025-06-09 11:40:53 (8.29 MB/s) - ‘minsearch.py’ saved [4014/4014]



In [1]:
import minsearch
import json



In [3]:
# Load the raw FAQ export from disk. The next cell iterates it as a list of
# course entries, each with a 'course' name and a 'documents' list.
with open('documents.json', 'rb') as f_in:
    docs_raw = json.loads(f_in.read())

In [4]:
# Flatten the per-course structure into one list of FAQ entries.
# Each entry is tagged (in place) with the course it came from so that
# searches can later be restricted to a single course.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc.update(course=course_dict['course'])
        documents.append(doc)

In [5]:
documents[0]

{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# 'question', 'text' and 'section' are registered as text fields and
# 'course' as a keyword field (used below via filter_dict).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [7]:
# Sample user question. The goal is to search the text/question/section
# fields to potentially find its answer.
q = 'the course has already started, can I still join it?'

In [7]:
index.fit(documents)

<minsearch.Index at 0x7df72ea13810>

In [9]:
# Per-field boosts: keyword matches in 'question' are weighted 3x
# (e.g. course/started/enroll), matches in 'section' 0.5x.
boost = {'question': 3.0, 'section': 0.5}

# Restrict results to a single course. Named `course_filter` rather than
# `filter` so the builtin `filter` is not shadowed for the rest of the
# kernel session.
course_filter = {'course': 'data-engineering-zoomcamp'}

results = index.search(
    query=q,
    boost_dict=boost,
    num_results=5,
    filter_dict=course_filter,
)

In [10]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homework.\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked tec

In [8]:
from openai import OpenAI
import os 

In [10]:
# Read the key from the environment: nothing in this notebook defines API_KEY
# (so the old cell failed on a fresh kernel), and secrets should not live in
# the notebook itself. Raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [23]:
# Baseline: send the bare question to the model, with no FAQ context attached.
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[
        {'role': 'user', 'content': q},
    ],
)


In [24]:
print(response.choices[0].message.content)

Whether you can still join a course that has already started depends on the specific policies of the institution or organization offering the course. Here are some steps you can take:

1. **Check the Course Policies**: Look for information on the course's website or syllabus regarding late enrollment.

2. **Contact the Instructor**: Reach out to the instructor or course coordinator. They may consider making exceptions on a case-by-case basis.

3. **Assess the Course Format**: If the course is mostly self-paced, joining late may be more feasible than in a structured, schedule-driven course.

4. **Consider Your Commitment**: Make sure you are comfortable with catching up on missed content before joining.

5. **Look for Alternatives**: If joining this particular course is not possible, consider looking for other sessions or related courses that might be available.

If you're interested, it's best to inquire as soon as possible!


### Clean the code

In [29]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve FAQ entries for ``query`` from the in-memory minsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            the course that was previously hard-coded, so ``search(query)``
            behaves as before.
        num_results: Forwarded to ``index.search`` as ``num_results``
            (default 5, the previously hard-coded value).

    Returns:
        The result of ``index.search``; in this notebook that is a list of
        FAQ dicts with 'text', 'section', 'question' and 'course' keys.
    """
    # Matches in 'question' are boosted 3x, matches in 'section' 0.5x.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [30]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each one becomes a block in the CONTEXT part.

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is visible.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per entry, each terminated by a blank line.
    context = "".join(
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [34]:
def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text.

    Args:
        prompt: Full prompt string to send.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def rag(query):
    """Answer ``query`` end to end: retrieve with ``search``, build the prompt, ask the LLM.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the assembled prompt.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

In [35]:
query = 'how do I run kafka?'

rag(query)

'To run Kafka, follow these steps based on the context provided:\n\n1. **For Java Kafka**: In your project directory, run the following command in the terminal:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka**: First, ensure you have the necessary Kafka packages. It is recommended to create a virtual environment and install the required packages by running:\n   ```\n   python -m venv env\n   source env/bin/activate  # For MacOS/Linux\n   # OR\n   env\\Scripts\\activate  # For Windows\n   pip install -r ../requirements.txt\n   ```\n\n3. Remember to deactivate the virtual environment when you are done by using the command `deactivate`.\n\nMake sure all Docker images are running if you are using a Docker setup.'

### Elasticsearch

Start a single-node Elasticsearch instance in Docker (security disabled, HTTP API published on port 9200):

docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:9.0.1

In [11]:
from elasticsearch import Elasticsearch

In [20]:
# 9200 is the HTTP port published by the `docker run` command above
# (-p 9200:9200); the previous value, 9400, is not mapped by that command.
es_client = Elasticsearch('http://localhost:9200')

In [22]:
es_client.info()

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))))

In [None]:
# Single shard, no replicas. The free-text fields are mapped as "text";
# "course" is mapped as "keyword" and is used in a term filter when searching.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# create() fails if the index already exists, and starting from an empty index
# keeps the indexing loop below from adding duplicate documents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7df72c45e390>: Failed to establish a new connection: [Errno 111] Connection refused))

In [None]:
documents[0]

In [None]:
from tqdm.auto import tqdm

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [None]:
# Demo question for the Elasticsearch-backed pipeline (typo "disovered" fixed).
query = 'I just discovered the course. Can I still join it?'

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries for ``query`` from the Elasticsearch index.

    Args:
        query: Free-text user question.
        course: Value for the term filter on the ``course`` field. Defaults to
            the course that was previously hard-coded, so
            ``elastic_search(query)`` behaves as before.
        num_results: Value sent as the request's ``size`` (default 5, the
            previously hard-coded value).

    Returns:
        A list with the ``_source`` of every hit in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" boosts the question field 3x.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def rag(query):
    """Answer ``query`` end to end, retrieving context from Elasticsearch.

    This intentionally redefines the ``rag`` function declared earlier in the
    notebook: the retrieval step uses ``elastic_search`` instead of ``search``;
    prompt building and the LLM call are the same.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the assembled prompt.
    """
    es_hits = elastic_search(query)
    full_prompt = build_prompt(query, es_hits)
    return llm(full_prompt)

In [None]:
rag(query)