## Load the documents with ids

In [22]:
import json
# Read the FAQ documents; each record is a dict that carries its own 'id'.
# The encoding is pinned to UTF-8 because the texts contain non-ASCII
# characters and the platform default encoding is not UTF-8 everywhere.
with open('documents-with-id.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [38]:
# Spot-check a single record to see which fields each document carries.
documents[6]

{'text': 'Yes. For the 2024 edition we are using Mage AI instead of Prefect and re-recorded the terraform videos, For 2023, we used Prefect instead of Airflow..',
 'section': 'General course-related questions',
 'question': 'Course - Is the current cohort going to be different from the previous cohort?',
 'course': 'data-engineering-zoomcamp',
 'id': 'f1d31564'}

## Load ground truth data

In [20]:
import pandas as pd
# Load the evaluation set and keep only the data-engineering-zoomcamp questions.
ground_truth_df = pd.read_csv('ground-truth-data.csv')
# Boolean-mask indexing actually drops the non-matching rows. DataFrame.where()
# keeps every row and only blanks the non-matching ones to NaN, which would
# leave all-NaN records in the list built below.
ground_truth_df = ground_truth_df[ground_truth_df['course'] == 'data-engineering-zoomcamp']
# One dict per remaining row, keyed by column name.
ground_truth = ground_truth_df.to_dict(orient='records')

In [21]:
# Spot-check one ground-truth record; its 'document' field appears to hold the
# id of the FAQ document that answers the question — confirm against the data.
ground_truth[10]

{'question': 'Can I enroll in the course after it starts?',
 'course': 'data-engineering-zoomcamp',
 'document': '7842b56a'}

## Build an id → document lookup for quick retrieval

In [27]:
# Lookup table: document id -> full document record (the same dict objects as in `documents`).
doc_id = dict((record['id'], record) for record in documents)

In [35]:
# Look up a document by its id and show its answer text.
doc_id['f1d31564']['text']

'Yes. For the 2024 edition we are using Mage AI instead of Prefect and re-recorded the terraform videos, For 2023, we used Prefect instead of Airflow..'

## Let's index the documents

In [43]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [45]:
# Sentence-transformers checkpoint used to embed both the indexed documents and the queries.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
# NOTE(review): the index mapping declares a 384-dim vector field for this model's
# output — presumably the model's embedding size; confirm if the model is swapped.
model = SentenceTransformer(model_name)

In [46]:
# Client for the Elasticsearch node at localhost:9200 (URL is hard-coded for a local setup).
es_client = Elasticsearch('http://localhost:9200') 

In [47]:
# Settings and field mappings for the FAQ index.
index_settings = {
    "settings": {
        # Single shard, no replicas.
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields of each FAQ entry.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Keyword fields; `course` is the field the kNN search filters on with a term query.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of "question + ' ' + text", filled in when the documents are indexed.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

In [48]:
# Name of the index used for both indexing and searching.
index_name = "course-questions"

# Drop any previous copy of the index before creating it, so the cell can be re-run;
# ignore_unavailable=True is passed so that a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Create the index with the settings and mappings defined in `index_settings`.
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [50]:
# Embed every FAQ entry and send it to Elasticsearch, one request per document.
for doc in documents:
    question, text = doc['question'], doc['text']
    # The embedding covers the question and the answer text, joined by a single space.
    combined = ' '.join((question, text))

    # The vector is stored on the document itself, so `documents` is mutated in place.
    doc['question_text_vector'] = model.encode(combined)
    es_client.index(index=index_name, document=doc)

## Retrieve the documents based on our query

In [128]:
def elastic_search_knn(field, vector, course, k=3, num_candidates=10000):
    """Run a kNN search against the FAQ index, restricted to one course.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against ``field``.
        course: Course identifier; the search is filtered with a term query
            on the ``course`` field.
        k: Number of nearest neighbours to request. Defaults to 3, the value
            that used to be hard-coded.
        num_candidates: Size of the candidate pool passed to Elasticsearch.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
            "filter": {
                "term": {
                    "course": course
                }
            },
        },
        # Only the readable fields are requested; the stored embedding is left out.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [129]:
def question_text_vector_knn(q, course):
    """Embed the query ``q`` and return the kNN hits for ``course``.

    The search runs on the 'question_text_vector' field; the result is
    whatever ``elastic_search_knn`` returns.
    """
    return elastic_search_knn('question_text_vector', model.encode(q), course)

In [130]:
# Smoke-test retrieval; the `course` argument restricts hits to the ML course.
question_text_vector_knn(q = 'Are sessions recorded if I miss one?', course='machine-learning-zoomcamp')

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '7dcaefe4'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '162728a4'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

In [142]:
def built_prompt(question, search_results):
    """Build the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        search_results: Iterable of dicts, each with 'section', 'question'
            and 'text' keys.

    Returns:
        The prompt string: fixed instructions, the question, then one
        "section / question / context" paragraph per search result.
    """
    # Spelled out line by line so the whitespace-only lines and the trailing
    # space after "CONTEXT:" stay visible; the text matches the prompt as it was.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION. If there is no answer, return NONE.\n"
        "    \n"
        "QUESTION: {question}\n"
        "    \n"
        "CONTEXT: \n"
        "{context}"
    )

    # Outer double quotes keep the f-string valid on Python versions before 3.12,
    # which reject reusing the enclosing quote character inside a replacement field.
    context = ''.join(
        f"section: {doc['section']}\nquestion: {doc['question']}\ncontext: {doc['text']}\n\n"
        for doc in search_results
    )
    return prompt_template.format(question=question, context=context)

In [139]:
from openai import OpenAI

# NOTE(review): no credentials are passed here; presumably the client picks up the
# API key from the environment — confirm before running on a new machine.
client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Inside this function the ``model`` parameter is the chat model name; it
    shadows the module-level embedding model of the same name.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])

    # Only the first choice of the completion is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [140]:
def rag(q, course):
    """Answer ``q`` for ``course``: retrieve FAQ entries, build the prompt, ask the LLM."""
    # Retrieval step: nearest FAQ entries for the question within the course.
    retrieved = question_text_vector_knn(q, course)
    # Generation step: the prompt combines the question with the retrieved entries.
    return llm(built_prompt(question=q, search_results=retrieved))

In [143]:
# End-to-end check of the RAG pipeline.
# NOTE(review): the query text contains a typo ('evaludated') and the recorded answer
# is 'NONE'; whether the typo caused the miss is not established here.
rag('How the project would be evaludated?', 'data-engineering-zoomcamp')

'NONE'