In [1]:
import json
import minsearch

In [2]:
# Load the raw course documents. JSON text is UTF-8 by specification, so the
# encoding is pinned explicitly instead of relying on the platform's locale
# default (which is not UTF-8 on every system).
with open("documents.json", "r", encoding="utf-8") as f:
    documents_raw = json.load(f)

In [3]:
# Flatten the per-course structure into a single list of documents, tagging
# every document with the name of the course it came from.
documents = []

for course_data in documents_raw:
    # Tag first (in place, on the dicts held by documents_raw) ...
    for document in course_data["documents"]:
        document["course"] = course_data["course"]
    # ... then append the whole batch, preserving the original order.
    documents.extend(course_data["documents"])

In [4]:
# Sample user question. Note that cell 11 further down reassigns `query`
# to a different string before it is used for search.
query = "the course has already started, can I still enroll?"

In [5]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved documents.

    Args:
        query: The user's question; inserted under the `# QUESTION:` heading.
        search_results: Iterable of dicts, each providing the keys
            "section", "question" and "text". An empty iterable produces a
            prompt whose context section is empty.

    Returns:
        The filled-in prompt string with leading/trailing whitespace stripped.

    Raises:
        KeyError: If a result is missing one of the required keys.
    """
    prompt_template = """
You're a course teaching assistant. Answer the `QUESTION` based on the `CONTEXT`.
Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION:
{question}

# CONTEXT:
{context}
"""
    # Keys are single-quoted inside the f-strings: reusing the enclosing
    # double quote in a replacement field is only legal from Python 3.12
    # (PEP 701) and is a SyntaxError on earlier interpreters.
    result_texts = [
        f"section: {result['section']}\n"
        f"question: {result['question']}\n"
        f"answer: {result['text']}\n"
        for result in search_results
    ]
    # Entries are separated by a blank line (each already ends with "\n").
    context = "\n".join(result_texts)
    return prompt_template.format(question=query, context=context).strip()
    

In [6]:
from elasticsearch import Elasticsearch

In [7]:
# Elasticsearch client pointed at a local endpoint on port 9200; reused by
# the index-creation, indexing and search cells below.
es_client = Elasticsearch("http://localhost:9200")

In [8]:
# Index definition: a single shard with no replicas, and an explicit mapping
# for the four document fields.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword (not analysed) so the exact-match `term` filter used in
            # elastic_search() works on the course name.
            "course": {"type": "keyword"}
        }
    }
}
index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed
# (Restart & Run All) without a "resource already exists" error, and so the
# indexing cell below does not pile duplicate documents onto old ones.
# ignore_unavailable=True makes the delete a no-op when the index is absent.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
from tqdm import tqdm

In [10]:
# Send every flattened document to the index, one request per document;
# tqdm wraps the iterable to show a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████| 948/948 [00:15<00:00, 61.18it/s]


In [11]:
# Question used for the retrieval and RAG calls in the cells below
# (replaces the value assigned to `query` in cell 4).
query = "I just discovered the course. Can I still join?"

def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve the best-matching documents for `query` from Elasticsearch.

    Args:
        query: Free-text question matched against the "question", "text"
            and "section" fields ("question" is boosted by a factor of 3).
        course: Exact value the "course" field must have. Defaults to
            "data-engineering-zoomcamp", the previously hard-coded filter.
        size: Maximum number of hits to return. Defaults to 5, the
            previously hard-coded value.

    Returns:
        A list with the `_source` dict of every hit, in the order returned
        by Elasticsearch.
    """
    es_query = {
        "size": size,
        "query": {
            "bool": {
                # Scored full-text part of the query.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restricts the hits to one course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=es_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [12]:
# Inspect the raw retrieval results for the sample query.
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [13]:
def rag(query):
    """Answer `query` with retrieval-augmented generation.

    Chains the three steps defined in this notebook: `elastic_search`
    fetches documents, `build_prompt` turns them into a prompt, and `llm`
    produces the answer, which is returned unchanged.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [15]:
from openai import OpenAI

In [20]:
# OpenAI client configured with a local base URL; used by llm() below.
# NOTE(review): port 11434 and the "ollama" key suggest a local Ollama server,
# in which case the key is presumably a placeholder rather than a secret — confirm.
client = OpenAI(
    base_url="http://localhost:11434/v1/",
    api_key="ollama"
)

In [23]:
def llm(prompt, model="qwen2:0.5b"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the content of one "user" message.
        model: Model name passed to the chat-completions call. Defaults to
            "qwen2:0.5b", the previously hard-coded value.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [31]:
# Show the question that the next cell sends through the RAG pipeline.
query

'I just discovered the course. Can I still join?'

In [35]:
# Run the full pipeline (search -> prompt -> LLM) on the sample question.
rag(query)

'Once you register for the Data Engineering Bootcamp, it will be indicated as "accepted" on ZoomCampQABot\'s registration list. Therefore, there isn\'t anything else required except accepting the data engineering boot camp offer and starting learning without registering, so the course could still begin soon enough to accept participants like you.'