In [None]:
from tqdm.auto import tqdm


In [None]:
import requests 

# Download the documents-with-ids JSON file from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error status instead of surfacing it later as a
# confusing JSON decode error on an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [None]:
documents[10]


In [None]:
import pandas as pd

# Build the raw-download URL of the ground-truth CSV.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = '/'.join([base_url, relative_url]) + '?raw=1'

# Load every row, then keep only the machine-learning-zoomcamp course.
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth.loc[df_ground_truth['course'] == 'machine-learning-zoomcamp']
# One dict per remaining row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
ground_truth[10]

In [None]:
# Lookup table: document id -> full document record.
doc_idx = dict((d['id'], d) for d in documents)
doc_idx['5170565b']['text']

In [None]:
doc_idx.keys()


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; every `model.encode(...)` call in this notebook
# (documents, queries and answers) goes through this instance.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [None]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# Single-shard, no-replica index with one dense vector field that holds the
# embedding of "question + text" for each document.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Check if the index already exists; create and fill it only on first run.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Index '{index_name}' created successfully.")

    # Filling index
    for doc in tqdm(documents):
        question = doc['question']
        text = doc['text']

        # Build a separate payload instead of writing the vector back into
        # `doc`. The records in `documents` are shared with `doc_idx`, and
        # mutating them only when the index was missing made their contents
        # depend on the state of the Elasticsearch server.
        es_doc = {
            **doc,
            'question_text_vector': model.encode(question + ' ' + text),
        }

        es_client.index(index=index_name, document=es_doc)

else:
    print(f"Index '{index_name}' already exists.")

In [None]:
def elastic_search_knn(field, vector, course):
    """Run a course-filtered kNN search and return the `_source` of each hit.

    Args:
        field: name of the vector field to search.
        vector: query vector compared against `field`.
        course: value the `course` field must match (term filter).

    Returns:
        A list with the `_source` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": {"term": {"course": course}},
        },
        # Only these fields are returned; the stored vector is left out.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

def question_text_vector_knn(q):
    """Embed `q['question']` and search the `question_text_vector` field.

    Args:
        q: mapping with at least the keys `question` and `course`.

    Returns:
        The list of documents returned by `elastic_search_knn`.
    """
    question, course = q['question'], q['course']
    return elastic_search_knn('question_text_vector', model.encode(question), course)

In [None]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

In [None]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for `query`.

    Args:
        query: the question text inserted after `QUESTION:`.
        search_results: iterable of dicts with the keys `section`,
            `question` and `text`; each becomes one context entry.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One entry per retrieved document, each followed by a blank line.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [None]:
from openai import OpenAI

client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: content of the user message.
        model: name of the chat model passed to the API.

    Returns:
        The message content of the first returned choice.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# previously: rag(query: str) -> str
def rag(query: dict, model='gpt-4o') -> str:
    """Answer `query['question']` using retrieved documents as context.

    Args:
        query: record with the keys `question` and `course`.
        model: chat model name forwarded to `llm`.

    Returns:
        The answer text produced by `llm`.
    """
    retrieved = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved), model=model)

In [None]:
ground_truth[10]

In [None]:
rag(ground_truth[10])


In [None]:
doc_idx['5170565b']['text']

In [None]:
# Worked example: compare one generated answer with the original FAQ answer.
answer_llm = 'Yes, sessions are recorded if you miss one. You can access the recordings and catch up on any missed content. Additionally, you can ask your questions in advance for office hours, which are also recorded, and engage through Slack.'
answer_orig = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

# Embed both answers with the same model.
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# Dot product of the two embeddings, used as the similarity score.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-normalized, presumably the case for this model; confirm.
v_llm.dot(v_orig)

In [None]:
answers = {}


In [None]:
# Generate answers for up to 10 ground-truth records that are not yet in `answers`.
# `count` tracks only the records answered during this run of the cell.
count = 0

for i, rec in enumerate(tqdm(ground_truth)):
    # Stop once 10 new answers have been produced in this run.
    if count == 10:
        break

    # Records answered by an earlier run are skipped and do not count towards
    # the limit, so re-running this cell adds further entries to `answers`.
    if i in answers:
        continue

    answer_llm = rag(rec)
    # `document` holds the id of the document this question refers to.
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    # Results are keyed by the record's position in `ground_truth`.
    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],

    }

    count += 1

In [None]:
answers


In [None]:
# Merge each generated answer with its ground-truth record, ordered by the
# record's position in `ground_truth`.
# The list is sized from `answers` itself: a fixed-length list of 10 raised
# IndexError as soon as `answers` held more than 10 entries, which happens
# when the generation cell is run more than once.
results_gpt4o = [
    {**answers[i], **ground_truth[i]}
    for i in sorted(answers)
]

In [None]:
results_gpt4o[:3]


In [None]:

import pandas as pd


In [None]:
df_gpt4o = pd.DataFrame(results_gpt4o)


In [None]:
df_gpt4o.head()


In [None]:
import os

# Create the output directory. `exist_ok=True` keeps the cell safe to re-run
# and avoids depending on a shell `mkdir` being available.
os.makedirs('data', exist_ok=True)


In [None]:
df_gpt4o.to_csv('data/results-gpt4o.csv', index=False)


In [None]:
rag(ground_truth[10], model='gpt-3.5-turbo')


In [None]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply `f` to every element of `seq` on `pool`, showing a progress bar.

    Args:
        pool: executor providing `submit`.
        seq: sized sequence of inputs (its `len` sets the progress total).
        f: callable applied to each element.

    Returns:
        The results in the same order as `seq`. An exception raised by `f`
        propagates when that element's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            # Advance the bar by one each time a task completes.
            progress.update()

        futures = []
        for el in seq:
            task = pool.submit(f, el)
            task.add_done_callback(_tick)
            futures.append(task)

        # Collect in submission order so outputs line up with inputs.
        results = [task.result() for task in futures]

    return results

In [None]:
def process_record(rec, model='gpt-3.5-turbo'):
    """Generate an answer for one ground-truth record and pair it with the original.

    Args:
        rec: ground-truth record with the keys `question`, `course` and
            `document` (the id of the document the question refers to).
        model: chat model name forwarded to `rag`. It defaults to the
            previously hard-coded 'gpt-3.5-turbo', so single-argument calls
            (e.g. through `map_progress`) behave as before.

    Returns:
        A dict with the keys `answer_llm`, `answer_orig`, `document`,
        `question` and `course`.
    """
    # Note: inside this function `model` is the chat model name, not the
    # module-level embedding model of the same name.
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [None]:
process_record(ground_truth[10])


In [None]:
results_gpt35 = map_progress(pool, ground_truth[:10], process_record)


In [None]:
df_gpt35 = pd.DataFrame(results_gpt35)
df_gpt35.to_csv('data/results-gpt35.csv', index=False)

In [None]:
df_gpt4o = pd.read_csv('data/results-gpt4o.csv')


In [None]:
results_gpt4o = df_gpt4o.to_dict(orient='records')


In [None]:
record = results_gpt4o[0]
record

In [None]:
def compute_similarity(record):
    """Return the dot product of the embeddings of the two answers in `record`.

    Args:
        record: mapping with the keys `answer_orig` and `answer_llm`.

    Returns:
        The dot product of the generated-answer embedding with the
        original-answer embedding.
    """
    reference, generated = record['answer_orig'], record['answer_llm']

    generated_vec = model.encode(generated)
    reference_vec = model.encode(reference)

    return generated_vec.dot(reference_vec)

In [None]:
# Similarity score for every GPT-4o result, in row order.
similarity = [compute_similarity(record) for record in tqdm(results_gpt4o)]

In [None]:
df_gpt4o['cosine'] = similarity
df_gpt4o['cosine'].describe()

In [None]:
df_gpt35 = pd.read_csv('data/results-gpt35.csv')


In [None]:
results_gpt35 = df_gpt35.to_dict(orient='records')

# Similarity score for every GPT-3.5 result, in row order.
similarity_35 = [compute_similarity(record) for record in tqdm(results_gpt35)]

In [None]:
df_gpt35['cosine'] = similarity_35
df_gpt35['cosine'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the similarity distributions of both models on a single Axes.
fig, ax = plt.subplots()

sns.histplot(df_gpt4o['cosine'], label='GPT-4o', kde=True, ax=ax)
sns.histplot(df_gpt35['cosine'], label='GPT3.5', kde=True, ax=ax)

ax.set_title("RAG LLM performance")
ax.set_xlabel("Cosine Similarity")
ax.legend()

plt.show()

In [None]:
# NOTE(review): no cell visible in this notebook writes
# 'data/results-gpt4o-mini.csv'; the file must already exist on disk for this
# cell to run. Confirm where it is produced.
df_gpt4o_mini = pd.read_csv('data/results-gpt4o-mini.csv')


In [None]:
df_gpt4o_mini.head()


In [None]:
# Judge prompt 1: rates the generated answer against the original answer.
# Placeholders filled by `.format`: {answer_orig}, {question}, {answer_llm}.
# The doubled braces render as literal braces in the formatted prompt.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt 2: rates the generated answer against the question only.
# Placeholders filled by `.format`: {question}, {answer_llm}.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
df_sample = df_gpt4o_mini.sample(n=150, random_state=1)


In [None]:
samples = df_sample.to_dict(orient='records')


In [None]:
record = samples[0]
record

In [None]:
prompt = prompt1_template.format(**record)
print(prompt)

In [None]:
answer = llm(prompt, model='gpt-4o-mini')


In [None]:
import json


In [None]:
# Number of sampled records sent to the judge model.
n_evaluations = 10

evaluations = []

# Slice the samples instead of counting inside the loop. The previous counter
# was incremented before the `== 10` check, so the loop stopped after 9
# records rather than 10. Slicing also gives the progress bar the right total.
for record in tqdm(samples[:n_evaluations]):
    prompt = prompt1_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations.append(evaluation)

In [None]:
# Parse each raw judge reply as JSON; a reply that is not valid JSON raises here.
json_evaluations = [json.loads(str_eval) for str_eval in evaluations]

In [None]:
df_evaluations = pd.DataFrame(json_evaluations)


In [None]:
df_evaluations.Relevance.value_counts()


In [None]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT'] #.to_dict(orient='records')


In [None]:
samples[4]


In [None]:
prompt = prompt2_template.format(**record)
print(prompt)

In [None]:
evaluation = llm(prompt, model='gpt-4o-mini')
print(evaluation)

In [None]:
# Number of sampled records sent to the judge model.
n_evaluations = 10

evaluations_2 = []

# Slice the samples instead of counting inside the loop. The previous counter
# was incremented before the `== 10` check, so the loop stopped after 9
# records rather than 10. Slicing also gives the progress bar the right total.
for record in tqdm(samples[:n_evaluations]):
    prompt = prompt2_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations_2.append(evaluation)

In [None]:
# Parse each raw judge reply as JSON; a reply that is not valid JSON raises here.
json_evaluations_2 = [json.loads(str_eval) for str_eval in evaluations_2]

In [None]:
df_evaluations_2 = pd.DataFrame(json_evaluations_2)


In [None]:
df_evaluations_2.Relevance.value_counts()


In [None]:
df_evaluations_2[df_evaluations_2.Relevance == 'PARTLY_RELEVANT']['Explanation'].values


In [None]:
samples[4]
