### Evaluation Data

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents (each record carries an 'id' used as the relevance key).
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of trying to parse an error page as JSON.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one row per generated question, pointing at the expected document id.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [10]:
# Peek at one sample from each loaded structure, separated by blank lines.
print('Documents:', documents[0], end='\n\n')
print('Df ground truth:', df_ground_truth.head(2), end='\n\n')
print('ground_truth:', ground_truth[0])

Documents: {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp', 'id': 'c02e79ef'}

Df ground truth:                              question                     course  document
0         When does the course begin?  data-engineering-zoomcamp  c02e79ef
1  How can I get the course schedule?  data-engineering-zoomcamp  c02e79ef

ground_truth: {'question': 'When does the course begin?', 'course': 'data-engineering-zoomcamp', 'document': 'c0

In [16]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: One list of relevance flags per query, in result order.

    Returns:
        Number of lists containing True divided by the number of lists.

    Raises:
        ZeroDivisionError: If `relevance_total` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the FIRST relevant result counts: it contributes
    1 / rank (rank is 1-based). Queries with no relevant result contribute 0.

    Args:
        relevance_total: One list of relevance flags per query, in result order.

    Returns:
        Sum of reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: If `relevance_total` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; otherwise a result list containing the
                # relevant id more than once would push the score above 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run `search_function` on every ground-truth record and score the results.

    Args:
        ground_truth: Iterable of records; each must have a 'document' key
            holding the id of the expected document.
        search_function: Callable taking one ground-truth record and returning
            an iterable of results, each indexable by 'id'.

    Returns:
        Dict with the 'hit_rate' and 'mrr' of the collected relevance flags.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    return metrics

### Q1. Minsearch text

  
`text_fields=["question", "section", "text"],`  
`keyword_fields=["course", "id"]`

`boost = {'question': 1.5, 'section': 0.1}`

In [19]:
import minsearch

# Text index over the FAQ documents. The search helper below queries it with
# a `course` filter and per-field boosts.
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

# Last expression of the cell: fits the index and displays the returned object.
index.fit(documents)

<minsearch.minsearch.Index at 0x2946b8586e0>

In [20]:
def search(query, course):
    """Query the minsearch text index for the top 5 documents of one course.

    Args:
        query: Free-text query string.
        course: Course name used as an exact-match filter.

    Returns:
        The result of `index.search` with the question field boosted (1.5)
        and the section field down-weighted (0.1).
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [21]:
# Collect, per ground-truth question, which of the returned documents is the expected one.
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = search(query=record['question'], course=record['course'])
    relevance_total.append([hit['id'] == expected_id for hit in hits])

100%|██████████| 4627/4627 [00:14<00:00, 323.21it/s]


In [23]:
# Q1 metric: hit rate of the boosted minsearch text search.
print('Q1 - answer:', hit_rate(relevance_total))

Q1 - answer: 0.848714069591528


### Embeddings

In [24]:
from minsearch import VectorSearch

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [26]:
# Embed only the question field of every document.
texts = [doc['question'] for doc in documents]

# TF-IDF followed by truncated SVD yields 128-component vectors; the fixed
# random_state keeps the SVD reproducible.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

### Q2 - Vector Search for question

In [27]:
# Vector index over the embeddings in X, with `course` registered as a keyword field
# (the search helper filters on it).
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x2946bb2bcb0>

In [36]:
def v_search(query, course):
    """Return the top 5 documents from `vindex` for an already-embedded query.

    Args:
        query: Query embedding, forwarded to `vindex.search` as `query_vector`.
        course: Course name used as an exact-match filter.

    Returns:
        The result of `vindex.search` for this vector, limited to 5 results.
    """
    return vindex.search(
        query_vector=query,
        filter_dict={'course': course},
        num_results=5
    )

In [38]:
# Embed each ground-truth question with the fitted pipeline, search the vector
# index, and record which returned documents match the expected id.
v_relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    query_embedding = pipeline.transform([record['question']])
    hits = v_search(query=query_embedding, course=record['course'])
    v_relevance_total.append([hit['id'] == expected_id for hit in hits])

100%|██████████| 4627/4627 [00:06<00:00, 718.09it/s]


In [40]:
# Q2 metric: MRR of vector search over question-only embeddings.
print('Q2 - answer:', mrr(v_relevance_total))

Q2 - answer: 0.3572833369353793


### Q3 - Vector search for question and answer

In [49]:
# Embed question and answer text together this time.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

In [50]:
# Same TF-IDF + SVD configuration as before, refit on the question+answer texts.
# This rebinds `pipeline` and `X`, replacing the question-only versions.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [51]:
# Rebuild the vector index on the new embeddings; `v_search` reads the global
# `vindex`, so it picks up this index from here on.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x2946bf91a90>

In [52]:
# Repeat the vector-search evaluation against the question+answer index.
v_relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    query_embedding = pipeline.transform([record['question']])
    hits = v_search(query=query_embedding, course=record['course'])
    v_relevance_total.append([hit['id'] == expected_id for hit in hits])

100%|██████████| 4627/4627 [00:11<00:00, 414.81it/s]


In [54]:
# Q3 metric: hit rate of vector search over question+answer embeddings.
print('Q3 - answer:', hit_rate(v_relevance_total))

Q3 - answer: 0.8210503566025502


### Q4 - Qdrant

In [57]:
from qdrant_client import QdrantClient, models
# Shared Qdrant client used by every Q4 cell below.
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance
from fastembed import TextEmbedding
import json

In [59]:
# Vector size used when the Qdrant collection is created.
# NOTE(review): presumably the output dimension of the model below — confirm.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model name passed to models.Document for both indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
# Define the collection name
collection_name = "zoomcamp-w3-q4"

# Create the collection only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an existing collection.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )

True

In [82]:
# One Qdrant point per document: the point id is the document's position in
# `documents`, the vector is produced from question + answer text, and the
# whole document dict is stored as payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]
# NOTE(review): shadows the builtin id(); kept because other cells may read this counter.
id = 0

In [83]:
# Send the prepared points to the collection; the returned update result is
# displayed as the cell output.
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [84]:
# Build the points with a trimmed payload (only the metadata fields needed later).
# `points` is rebuilt from scratch rather than appended to, so the list never holds
# two points with the same id, and ids come from enumerate() instead of a global
# counter named `id` (which shadows the builtin and keeps growing on every re-run).
# The document id is stored under "id" because the evaluation reads payload["id"].
points = [
    models.PointStruct(
        id=point_id,
        # embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle),
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id'],
        },  # save all needed metadata fields
    )
    for point_id, doc in enumerate(documents)
]

In [85]:
def quadrant_search(query, course, limit=5):
    """Query the Qdrant collection for the points closest to a text query.

    Args:
        query: Query text; wrapped in models.Document with `model_handle`.
        course: Only points whose payload field "course" matches this value are returned.
        limit: Maximum number of points to return (default 5).

    Returns:
        The response of `client.query_points`, with payloads included.
    """
    query_document = models.Document(text=query, model=model_handle)
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,
        with_payload=True,
        query_filter=models.Filter(must=[course_condition]),
    )

In [86]:
# Evaluate Qdrant search: for each ground-truth question, flag which returned
# points carry the expected document id in their payload.
q_relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    response = quadrant_search(record['question'], record['course'])
    q_relevance_total.append(
        [point.payload["id"] == expected_id for point in response.points]
    )

100%|██████████| 4627/4627 [01:10<00:00, 65.37it/s]


In [88]:
# Q4 metric: MRR of the Qdrant search.
print('Q4 - answer:', mrr(q_relevance_total))

Q4 - answer: 0.008176644333981697


### Q5 - Average cosine

In [100]:
import numpy as np

In [91]:
def cosine(u, v):
    """Return the cosine similarity between vectors `u` and `v`.

    Computed as dot(u, v) divided by the product of the two Euclidean norms.
    Both arguments must provide a `.dot` method.
    """
    norms = [np.sqrt(vec.dot(vec)) for vec in (u, v)]
    return u.dot(v) / (norms[0] * norms[1])

In [92]:
# RAG evaluation results; later cells read the answer_llm, answer_orig and
# question columns from this frame.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [93]:
# Fresh, unfitted TF-IDF + SVD pipeline for the answer-similarity section.
# This rebinds the global `pipeline` used by the earlier search sections.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [94]:
# Fit on the row-wise concatenation of LLM answer, original answer and question,
# so that one fitted pipeline is used to embed both answer columns below.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [96]:
# Embed both answer columns with the fitted pipeline; row i of each result
# corresponds to row i of df_results.
a_llm = pipeline.transform(df_results.answer_llm)
a_orig = pipeline.transform(df_results.answer_orig)

In [98]:
# Sanity check: number of embedded original answers.
len(a_orig)

1830

In [102]:
# Spot check: similarity between the first LLM answer and the first original answer.
cosine(a_llm[0], a_orig[0])

np.float64(0.46352620160029984)

In [103]:
# Per-row cosine similarity between the LLM answer and the original answer.
# The loop variables have their own names: reusing `a_llm` / `a_orig` would rebind
# the embedding matrices to their last rows and make a re-run of this cell wrong.
cosine_similarity = [
    cosine(vec_llm, vec_orig)
    for vec_llm, vec_orig in zip(a_llm, a_orig)
]

In [110]:
# Q5 metric: mean cosine similarity across all answer pairs.
print('Q5 - answer:', np.array(cosine_similarity).mean())

Q5 - answer: 0.8415841233490402


### Q6 - Rouge

In [112]:
from rouge import Rouge
# Scorer reused for every ROUGE computation below.
rouge_scorer = Rouge()

# Worked example on a single row (positional index 10); get_scores returns a
# list, of which the first element is taken.
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [118]:
# Show the LLM answer of the example row scored above.
r.answer_llm

"Yes, all sessions are recorded, so if you miss one, you won't miss anything. You can catch up on the content later. Additionally, you can submit your questions in advance for office hours, and those sessions are also recorded."

In [128]:
# ROUGE-1 F-score of every LLM answer against its original answer.
# The per-row scores use their own name (`row_scores`) so the example row `r`
# defined earlier is not overwritten with a dict.
rouge_total = []
for _, row in df_results.iterrows():
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    rouge_total.append(row_scores['rouge-1']['f'])

print('Q6 - answer:', np.array(rouge_total).mean())

Q6 - answer: 0.3516946452113943
