In [52]:
import minsearch
import json
from openai import OpenAI
from dotenv import load_dotenv
import os
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [53]:
with open("documents.json","rt") as f_in : 
    docs_raw = json.load(f_in)

In [54]:
documents = []

for course in docs_raw : 
    for doc in course["documents"]:
        doc['course'] = course['course']
        documents.append(doc)

In [55]:
index = minsearch.Index(

    text_fields=['question','text','section'],
    keyword_fields= ['course']
)

index.fit(documents)

<minsearch.Index at 0x71ae7239fbf0>

In [56]:
def search(query): 
    boost = {'question':3.0,'section':0.5}

    results = index.search(
        query=query,
        boost_dict=boost,
        num_results=10,
        filter_dict={'course' : 'data-engineering-zoomcamp'}
    )

    return results 

In [57]:
def build_query(query,search_results):

    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    context = ""
        
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query,context=context).strip() 

    return prompt

In [58]:
def LLM(prompt):
    #configuration
    load_dotenv()
    key = os.getenv('OPENAI_API_KEY')
    client = OpenAI(api_key=key)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role':'user','content':prompt}]
    )

    return response.choices[0].message.content

In [59]:
def rag(query):
    search_results = search(query=query)
    prompt = build_query(query=query,search_results=search_results)
    answer = LLM(prompt=prompt)
    return answer

In [60]:
query = 'should I know python before ?'
answer = rag(query=query)
print(answer)

Yes, you should know Python before the course starts. It is one of the prerequisites along with other tools and technologies mentioned in the FAQ.


In [61]:
es_client = Elasticsearch(hosts="http://localhost:9200")

In [62]:
index_settings = {
    
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'
es_client.indices.create(index=index_name,body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/T9XdkSiLSuWiusYdCTHtig] already exists')

In [63]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [64]:
def elastic_search(query):
    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    es_response = es_client.search(index=index_name, body=search_query)
    es_search_result = []

    for hit in es_response['hits']['hits']:
        es_search_result.append(hit['_source'])
    return es_search_result

In [65]:
def rag(query):
    search_results = elastic_search(query=query)
    prompt = build_query(query=query,search_results=search_results)
    answer = LLM(prompt=prompt)
    return answer

In [66]:
query = 'should I know python before ?'
answer = rag(query=query)
print(answer)

Yes, you should know Python before the course starts, as it is listed among the prerequisites and requirements that you should install and set up, specifically Python 3 (installed with Anaconda).
