In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. JSON is UTF-8 by specification and the FAQ text
# contains non-ASCII punctuation (curly quotes), so the encoding is pinned
# instead of relying on the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into one list of FAQ records, tagging each
# record with the course it belongs to. Every record is copied rather than
# edited in place, so docs_raw stays exactly as loaded and re-running this
# cell always yields the same result.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [5]:
# Peek at the first flattened record to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# "course" is declared as a keyword field (it is what the search filter is
# applied to); question, text and section are the free-text fields.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';

In [7]:
# Sample question; it is sent to the model as-is (no FAQ context) in the baseline call below.
q = 'the course has already started, can I still enroll?'

In [8]:
# Fit the index on the flattened FAQ records.
index.fit(documents)

<minsearch.Index at 0x1e8d07a38f0>

In [9]:
#results

In [10]:
import os
import openai
from openai import OpenAI

In [11]:
from dotenv import load_dotenv
# Load settings from a .env file. Presumably this supplies the OpenAI API key
# picked up by OpenAI() below — confirm against your local .env.
load_dotenv()

True

In [12]:
# Client created with default settings; no credentials are passed in code.
client = OpenAI()

In [13]:
# Baseline: send the bare question to the model, with no FAQ context attached.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model='gpt-4o',
)

In [14]:
# Show the reply text of the first returned choice.
response.choices[0].message.content

"Whether or not you can still enroll in a course that has already started depends on a few factors, such as the institution's policies, the specific course, and how far along the course is. \n\nHere are some steps you can take to find out:\n\n1. **Check the Course Website:** Some courses have detailed enrollment policies listed online. Look for information about late registration or enrollment.\n\n2. **Contact the Instructor:** Reach out to the course instructor via email or another method of communication provided by the institution. Explain your situation and ask if it's possible to join the course late.\n\n3. **Speak with Academic Advising:** Many institutions have academic advisors who can provide guidance on course enrollment policies and may be able to grant exceptions or point you toward other sections or alternative courses.\n\n4. **Check the Institution's Policies:** Look at the academic calendar and policies regarding late enrollment, add/drop periods, and deadlines.\n\n5. **

In [15]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ records for ``query`` in the minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course that was previously hard-coded.
        num_results: Maximum number of records to request. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Matches in the question field are weighted up, section matches down;
    # the text field has no entry here.
    boost = {'question': 3.0, 'section': 0.5}
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [16]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ records.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each with ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The prompt string with surrounding whitespace stripped.

    Raises:
        KeyError: If a record lacks one of the three keys above.
    """
    # The template is spelled out line by line so the prompt sent to the model
    # does not inherit this function's source indentation.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "CONTEXT:  {context}"
    )

    # One block per record, separated by blank lines; joined in a single pass.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    return prompt_template.format(question=query, context=context).strip()
    

In [17]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Chat model name. Defaults to the previously hard-coded
            ``'gpt-4o'``.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [18]:
def rag(query):
    """Answer ``query`` using the minsearch-backed pipeline.

    Retrieves records with ``search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [20]:
# Example question for the minsearch-backed rag() call that follows.
query = 'how do I run kafka?'

In [21]:
# Run the full retrieve -> prompt -> LLM pipeline on the example question.
rag(query)

'To run Kafka, locate the specific Java class (e.g., JsonProducer.java) in the project directory and execute the following command in the terminal:\n\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```'

In [22]:
# Same question as the no-context baseline earlier, now answered from FAQ context.
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. However, keep in mind that there are deadlines for turning in the final projects, so it's important not to leave everything until the last minute."

In [23]:
from elasticsearch import Elasticsearch

In [24]:
# Client pointed at an Elasticsearch node on localhost, port 9200, over plain HTTP.
es_client = Elasticsearch('http://localhost:9200')

In [25]:
# Sanity check: ask the server for its node/cluster info before using it.
es_client.info()

ObjectApiResponse({'name': 'de97d13c1bdc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'uXTUVB2gSRyEspg50Oo-bA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [26]:
# Single-shard, zero-replica index. The three free-text fields are mapped as
# "text"; "course" is mapped as "keyword" because it is only used in the
# exact-match term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from an earlier run first. Without this, creating
# the index a second time raises because it already exists, and re-running
# the indexing loop would add every document again.
# NOTE: this deletes the existing "course-questions" index and its contents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [27]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# Send every FAQ record to Elasticsearch, one request per record, with a
# progress bar over the whole list.
for faq_record in tqdm(documents):
    es_client.index(document=faq_record, index=index_name)

100%|█████████████████████████████████████████████████████████████████████████████████| 948/948 [00:49<00:00, 19.03it/s]


In [29]:
# Example question for the Elasticsearch-backed pipeline.
# NOTE(review): 'disovered' is misspelled; left untouched because this is the
# exact text sent to search and to the model.
query = 'I just disovered the course. Can I still join it?'

In [30]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ records for ``query`` in the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal (term filter).
            Defaults to the course that was previously hard-coded.
        num_results: Value sent as the request ``size``. Defaults to 5.

    Returns:
        A list with the ``_source`` of each returned hit, in response order.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]


In [31]:
def rag(query):
    """Answer ``query`` using the Elasticsearch-backed pipeline.

    This rebinds the name ``rag`` that an earlier cell defined on top of
    ``search``; once this cell has run, ``rag`` retrieves its context with
    ``elastic_search`` instead. Prompt building and the LLM call are the same.
    """
    context_docs = elastic_search(query)
    return llm(build_prompt(query, context_docs))

In [32]:
# Run the Elasticsearch-backed pipeline on the example question.
rag(query)

'Yes, you can still join the course even if you discover it after the start date. You are eligible to submit the homework, but keep in mind the deadlines for turning in the final projects. So make sure not to leave everything for the last minute.'