## Load the documents with ids

In [224]:
import json
# Load the FAQ documents (each one has an 'id' field) from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, which is not UTF-8 on every system.
with open('documents-with-ids.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [225]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

## Load ground truth data

In [226]:
import pandas as pd
# Read the ground-truth rows (question, course, document id).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default; only the read itself needs the open file handle.
with open('ground-truth-data.csv', 'r', encoding='utf-8') as g_file:
    ground_truth_df = pd.read_csv(g_file)

# Keep only the machine-learning-zoomcamp questions and turn them into a list of dicts.
ground_truth_df = ground_truth_df[ground_truth_df['course'] == 'machine-learning-zoomcamp']
ground_truth = ground_truth_df.to_dict(orient='records')

In [227]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

## Build an id → document lookup to retrieve documents quickly

In [228]:
# Map each document's 'id' to the full document dict so a document can be fetched by id.
# If two documents shared an id, the later one in `documents` would win.
doc_id = {d['id'] : d for d in documents}

In [229]:
doc_id['ea739c65']['text']

'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.'

## Let's index the documents

In [230]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [231]:
# Sentence-transformers model; the same instance embeds the indexed documents
# and the incoming questions further down in this notebook.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [232]:
es_client = Elasticsearch('http://localhost:9200') 

In [233]:
# Definition of the FAQ index: one shard, no replicas, text/keyword fields for the
# document attributes, plus a cosine-similarity dense vector for the embedded
# question + answer text.
# NOTE(review): dims=384 presumably matches the embedding size of the model in use — confirm.
index_settings = dict(
    settings=dict(
        number_of_shards=1,
        number_of_replicas=0,
    ),
    mappings=dict(
        properties=dict(
            text=dict(type="text"),
            section=dict(type="text"),
            question=dict(type="text"),
            course=dict(type="keyword"),
            id=dict(type="keyword"),
            question_text_vector=dict(
                type="dense_vector",
                dims=384,
                index=True,
                similarity="cosine",
            ),
        ),
    ),
)

In [234]:
# Name of the Elasticsearch index used for both indexing and searching.
index_name = "course-questions"

# Delete any existing index of this name first, then recreate it from index_settings,
# so re-running the cell starts from an empty index. ignore_unavailable=True is passed
# so that a missing index is presumably not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [235]:
# Embed every FAQ entry (question and answer text joined by a single space) and index it.
# The vector is stored on the document dict itself, so `documents` is mutated in place.
for doc in documents:
    combined_text = doc['question'] + ' ' + doc['text']
    doc['question_text_vector'] = model.encode(combined_text)
    es_client.index(index=index_name, document=doc)

## Retrieve the documents based on our query

In [236]:
def elastic_search_knn(field, vector, course):
    """Run a course-filtered kNN search and return the matching documents.

    Args:
        field: Name of the vector field to search against.
        vector: Query embedding sent as the kNN ``query_vector``.
        course: Value used in the ``term`` filter on the ``course`` field.

    Returns:
        A list holding the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": {"term": {"course": course}},
        },
        # Only these document fields are requested back; the stored vector is not.
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [237]:
def question_text_vector_knn(q):
    """Embed a question and search the combined question+text vector field.

    Args:
        q: Mapping with a ``'question'`` string and the ``'course'`` to filter on.

    Returns:
        Whatever ``elastic_search_knn`` returns for the ``question_text_vector`` field.
    """
    question, course = q['question'], q['course']
    return elastic_search_knn('question_text_vector', model.encode(question), course)

In [238]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

In [239]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for a question and its retrieved FAQ entries.

    Args:
        query: The question text placed after ``QUESTION:``.
        search_results: Iterable of dicts with ``'section'``, ``'question'`` and
            ``'text'`` keys; each becomes one section/question/answer block.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # One block per retrieved document, each terminated by a blank line.
    context_blocks = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    context = "".join(context_blocks)

    # Written line by line so the trailing space after "CONTEXT:" stays visible.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    return template.format(question=query, context=context).strip()

In [240]:
from openai import OpenAI

client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only (user) message.
        model: Name of the chat model to use. Inside this function the parameter
            shadows the notebook-level ``model`` variable.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [241]:
# previously: rag(query: str) -> str
def rag(query: dict) -> str:
    """Answer a question with retrieval-augmented generation.

    Args:
        query: Mapping with the ``'question'`` text; the whole mapping is passed
            to the retrieval step.

    Returns:
        The LLM's answer to the prompt built from the question and the retrieved documents.
    """
    retrieved = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved))

In [242]:
rag(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'))

'Yes, sessions are recorded, so if you miss one, you won’t miss anything. You can catch up on the recorded material, and you can also ask questions in advance for office hours, which will be covered during the live stream.'

## Test with questions that were generated by an LLM in previous lessons

In [243]:
ground_truth[1]

{'question': 'Can you provide a link to sign up?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [244]:
print(rag(ground_truth[1]))

You can sign up using the link in the course GitHub repository: [Sign Up Here](https://airtable.com/shryxwLd0COOEaqXo).


In [245]:
doc_id['c02e79ef']['text']

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

## Cosine similarity between the LLM answer and the original answer (1 question)

In [246]:
llm_response = """To receive course announcements, join the course Telegram channel with announcements and register in DataTalks.Club's Slack and join the channel"""

In [247]:
llm_response_v = model.encode(llm_response)

In [248]:
real_response = doc_id['c02e79ef']['text']

In [249]:
real_response_v = model.encode(real_response)

In [250]:
llm_response_v.dot(real_response_v)

0.7277642

## Cosine similarity between the LLM answers and the original answers (all questions)

In [251]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [252]:
answers = {}

In [253]:
# Generate an LLM answer for every ground-truth question and pair it with the text of
# the document the question points to. Results are keyed by position in `ground_truth`.
for idx, gt_record in enumerate(ground_truth):
    # Positions already present in `answers` are skipped, so re-running the cell
    # only fills in what is still missing.
    if idx not in answers:
        generated_answer = rag(gt_record)
        source_doc = doc_id[gt_record['document']]

        answers[idx] = dict(
            answer_llm=generated_answer,
            real_answer=source_doc['text'],
            document_id=source_doc['id'],
            question=gt_record['question'],
            course=source_doc['course'],
        )

print('Done')

Done


In [254]:
len(answers)

1830

In [255]:
answers[0]

{'answer_llm': 'You can sign up for the course by going to the course page at http://mlzoomcamp.com/.',
 'real_answer': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document_id': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [257]:
df_gpt4_o = pd.DataFrame(answers.values())

In [258]:
df_gpt4_o.sample(n = 5).to_dict(orient = 'records')[0]

{'answer_llm': 'Values tend to be close to the mean if they have a low standard deviation.',
 'real_answer': 'In statistics, the standard deviation is a measure of the amount of variation or dispersion of a set of values. A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range. [Wikipedia] The formula to calculate standard deviation is:\n(Aadarsha Shrestha)',
 'document_id': '266faa6d',
 'question': 'Where do values tend to be if they have a low standard deviation?',
 'course': 'machine-learning-zoomcamp'}

In [260]:
df_gpt4_o.to_csv('results_df_gpt4_oturbo.csv', index = False)

## Compute Cosine Similarity

- A – the original answer (the text of the FAQ document)
- Q – the synthetically generated question
- A' – the answer produced by the LLM for Q

We compute the cosine similarity between A and A'.

In [261]:
import pandas as pd
pd.read_csv('results_df_gpt4_oturbo.csv')[:2]

Unnamed: 0,answer_llm,real_answer,document_id,question,course
0,You can sign up for the course by going to the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up by visiting the course GitHub ...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp


In [267]:
df = pd.read_csv('results_df_gpt4_oturbo.csv')

In [268]:
# Keep only machine-learning-zoomcamp rows whose answer_llm is not the literal 'NONE'.
# .copy() makes the result an independent frame, so assigning new columns to it
# does not trigger pandas' SettingWithCopyWarning.
df = df[(df['answer_llm'] != 'NONE') & (df['course'] == 'machine-learning-zoomcamp')].copy()

In [269]:
df['course'].unique()

array(['machine-learning-zoomcamp'], dtype=object)

In [270]:
results_gpt4_o = df.to_dict(orient = 'records')

In [271]:
results_gpt4_o[0]

{'answer_llm': 'You can sign up for the course by going to the course page at http://mlzoomcamp.com/.',
 'real_answer': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document_id': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [272]:
import math 
def cosine_similarity(record):
    """Cosine similarity between the embedded LLM answer and the embedded original answer.

    The dot product is divided by the product of the two vector norms, so the
    result is a true cosine even if the embeddings are not unit-length.

    Args:
        record: Mapping with ``'answer_llm'`` and ``'real_answer'`` texts.

    Returns:
        The cosine similarity of the two embeddings, or ``0.0`` when either
        embedding has zero norm (the cosine is undefined in that case).
    """
    llm_ans = model.encode(record['answer_llm'])
    real_ans = model.encode(record['real_answer'])

    # ||a|| * ||b|| computed as sqrt((a.a) * (b.b)).
    norm_product = math.sqrt(llm_ans.dot(llm_ans) * real_ans.dot(real_ans))
    if norm_product == 0:
        return 0.0

    return llm_ans.dot(real_ans) / norm_product

In [273]:
# One similarity score per record, in the same order as results_gpt4_o.
# No guard is needed: testing `cosine_similarity is not None` checks the function
# object itself and is therefore always true.
results = [cosine_similarity(rec) for rec in results_gpt4_o]

In [274]:
df['cosine_sim'] = results

In [275]:
df[:10]

Unnamed: 0,answer_llm,real_answer,document_id,question,course,cosine_sim
0,You can sign up for the course by going to the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.416958
1,You can sign up by visiting the course GitHub ...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.368035
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.710604
3,"Based on the provided context, there is no men...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,-0.032536
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,0.408523
5,"The course videos are pre-recorded, so you can...","The course videos are pre-recorded, you can st...",39fda9f0,Are the course videos live or pre-recorded?,machine-learning-zoomcamp,0.763997
6,You can start watching the course videos right...,"The course videos are pre-recorded, you can st...",39fda9f0,When can I start watching the course videos?,machine-learning-zoomcamp,0.778924
7,"Yes, the live office hours sessions are recorded.","The course videos are pre-recorded, you can st...",39fda9f0,Are the live office hours sessions recorded?,machine-learning-zoomcamp,0.571757
8,You can find the office hours sessions recorde...,"The course videos are pre-recorded, you can st...",39fda9f0,Where can I find the office hours sessions?,machine-learning-zoomcamp,0.761989
9,You can access the pre-recorded course videos ...,"The course videos are pre-recorded, you can st...",39fda9f0,Where can I access the pre-recorded course vid...,machine-learning-zoomcamp,0.694365


In [276]:
df['cosine_sim'].describe()

count    1830.000000
mean        0.680557
std         0.217157
min        -0.147974
25%         0.591140
50%         0.735822
75%         0.835435
max         0.987929
Name: cosine_sim, dtype: float64