In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [3]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [4]:
doc_idx = {d['id']: d for d in documents}

In [5]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(query = q['question'], course = q['course'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


## Q1. Minsearch Text
## Index Data

In [6]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x77f816fcc8c0>

In [7]:
boost = {'question': 1.5, 'section': 0.1}

def minsearch_search(query, course):
    results = index.search(
        query=query, 
        boost_dict=boost,
        filter_dict={'course': course},
        num_results=5
    )

    return results

In [8]:
evaluate(ground_truth, minsearch_search)

100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:13<00:00, 337.64it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Embeddings

In [9]:
from minsearch import VectorSearch

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [11]:
# Create Embeddings for the "question" field
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

## Q2. Vector search for question

In [12]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77f816d8ecc0>

In [13]:
from tqdm.auto import tqdm

def minsearch_search_fn(query, course, k=5):
    """
    Embed the query with the SAME fitted pipeline and search in Minsearch with a course filter.
    """
    # TF-IDF expects an iterable -> wrap in a list
    q_mat = pipeline.transform([query])       # shape: (1, n_components)
    # Make a 1D vector
    q_vec = q_mat.toarray()[0] if hasattr(q_mat, "toarray") else q_mat[0]

    return vindex.search(q_vec, filter_dict={'course': course}, num_results=k)

def mrr_at_k(ground_truth, k=5):
    """
    Compute Mean Reciprocal Rank@k.
    Assumes each ground_truth item has keys: 'question', 'course', 'document' (relevant doc id).
    """
    total_rr = 0.0
    n = 0

    for q in tqdm(ground_truth):
        results = minsearch_search_fn(q['question'], q['course'], k=k)
        # Find rank (1-based) of the relevant document
        rank = None
        for i, d in enumerate(results):
            if d.get('id') == q['document']:
                rank = i + 1
                break
        if rank is not None:
            total_rr += 1.0 / rank  # reciprocal rank (0 if not found)
        n += 1

    return total_rr / n if n else 0.0

# --- Run it ---
mrr5 = mrr_at_k(ground_truth, k=5)
print({"MRR@5": mrr5})


100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:06<00:00, 731.13it/s]

{'MRR@5': 0.3568510914199265}





## Q3. Vector search for questions and answers

In [14]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [15]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77f81513efc0>

In [16]:
from tqdm.auto import tqdm

def minsearch_search_fn(query, course, k=5):
    """
    Embed the query with the SAME fitted pipeline and search in Minsearch with a course filter.
    """
    q_mat = pipeline.transform([query])       # shape: (1, n_components)
    q_vec = q_mat.toarray()[0] if hasattr(q_mat, "toarray") else q_mat[0]

    return vindex.search(q_vec, filter_dict={'course': course}, num_results=k)

def evaluate_at_k(ground_truth, k=5):
    """
    Compute MRR@k and Hit Rate@k.
    """
    total_rr = 0.0
    hits = 0
    n = 0

    for q in tqdm(ground_truth):
        results = minsearch_search_fn(q['question'], q['course'], k=k)

        rank = None
        for i, d in enumerate(results):
            if d.get('id') == q['document']:
                rank = i + 1
                break

        if rank is not None:
            total_rr += 1.0 / rank      # reciprocal rank
            hits += 1                   # count a hit if relevant doc is in top-k
        n += 1

    return {
        f'MRR@{k}': total_rr / n if n else 0.0,
        f'HitRate@{k}': hits / n if n else 0.0
    }

# --- Run it ---
metrics_k5 = evaluate_at_k(ground_truth, k=5)
print(metrics_k5)


100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:06<00:00, 699.63it/s]

{'MRR@5': 0.6711944384410349, 'HitRate@5': 0.8210503566025502}





## Q5. Qdrant

Now let's evaluate the following settings in Qdrant:

- text = doc['question'] + ' ' + doc['text']
- model_handle = "jinaai/jina-embeddings-v2-small-en"
- limit = 5

In [27]:
from qdrant_client import QdrantClient, models

In [28]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [29]:
from fastembed import TextEmbedding

In [30]:
text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
EMBEDDING_DIMENSIONALITY=512
limit = 5

In [33]:
collection_name = "llmzoomcamp-evaluation-homework"

In [34]:
client.delete_collection(collection_name=collection_name)

True

In [35]:
# Define the collection name
collection_name = "llmzoomcamp-evaluation-homework"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [36]:
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [38]:
points = []
id = 0

for doc in documents:
    point = models.PointStruct(
        id=id,
        # Embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        vector=models.Document(
            text=doc['text'],
            model=model_handle
        ),
        # Save all needed metadata fields
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        }
    )
    points.append(point)
    id += 1


In [39]:
# client.upsert(
#     collection_name=collection_name,
#     points=points
# )
from tqdm.auto import tqdm

BATCH_SIZE = 128  # adjust depending on your memory / API limits

for i in tqdm(range(0, len(points), BATCH_SIZE), desc="Upserting to Qdrant"):
    batch = points[i : i + BATCH_SIZE]
    client.upsert(
        collection_name=collection_name,
        points=batch
    )


Upserting to Qdrant:   0%|                                                                                                                                                      | 0/8 [00:00<?, ?it/s]
Fetching 5 files:   0%|                                                                                                                                                         | 0/5 [00:00<?, ?it/s][A
Fetching 5 files:  20%|█████████████████████████████                                                                                                                    | 1/5 [00:01<00:07,  1.77s/it][A
Fetching 5 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.02s/it][A
Upserting to Qdrant: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [03:01<00:00, 22.73s/

In [40]:
minsearch_search("What if I submit homeworks late?", "data-engineering-zoomcamp")

[{'text': 'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]',
  'section': 'General course-related questions',
  'question': 'Homework - Are late submissions of homework allowed?',
  'course': 'data-engineering-zoomcamp',
  'id': 'be5bfee4'},
 {'text': 'when trying to:\nURL="spark://$HOSTNAME:7077"\nspark-submit \\\n--master="{$URL}" \\\n06_spark_sql.py \\\n--input_green=data/pq/green/2021/*/ \\\n--input_yellow=data/pq/yellow/2021/*/ \\\n--output=data/report-2021\nand you get errors like the following (SUMMARIZED):\nWARN Utils: Your hostname, <HOSTNAME> resolves to a loopback address..\nWARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address Setting default log level to "WARN".\nException in thread "main" org.apache.spark.SparkException: Master must either be yarn or start with spark, me

In [41]:
# Build an id -> question lookup once (if you have the documents list)
doc_question_by_id = {d["id"]: d.get("question") for d in documents}  # safe even if some don't have 'question'

def format_qdrant_results(resp, question_lookup=None):
    """
    Convert Qdrant QueryResponse -> list of dicts with desired fields.
    If question_lookup is provided (dict id->question), add 'question'.
    """
    out = []
    for sp in resp.points:
        payload = sp.payload or {}
        item = {
            "text": payload.get("text"),
            "section": payload.get("section"),
            "course": payload.get("course"),
            "id": payload.get("id") or sp.id,  # prefer payload id; fallback to numeric point id
        }
        if question_lookup is not None:
            item["question"] = question_lookup.get(item["id"])
        out.append(item)
    return out


In [42]:
def search_in_course(query, course="data-engineering-zoomcamp"):

    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return format_qdrant_results(results, question_lookup=doc_question_by_id)

In [43]:
search_in_course("What if I submit homeworks late?", "data-engineering-zoomcamp")

[{'text': 'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]',
  'section': 'General course-related questions',
  'course': 'data-engineering-zoomcamp',
  'id': 'be5bfee4',
  'question': 'Homework - Are late submissions of homework allowed?'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a',
  'question': 'Course - Can I still join the course after the start date?'},
 {'text': 'You will have two attempts for a project. If the first project deadline is over and you’re late or you submit the project and fail the first attemp

In [44]:
evaluate(ground_truth,search_in_course)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:36<00:00, 47.93it/s]


{'hit_rate': 0.8417981413442835, 'mrr': 0.7331028023917593}

## Q5. Cosine similarity

In [36]:
import numpy as np

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [18]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [19]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [20]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [24]:
df_results.iloc[0]

answer_llm     You can sign up for the course by visiting the...
answer_orig    Machine Learning Zoomcamp FAQ\nThe purpose of ...
document                                                0227b872
question                     Where can I sign up for the course?
course                                 machine-learning-zoomcamp
Name: 0, dtype: object

This is how we calculate the Average Cosine Similarity between answers from LLMs and the orignal ansers:
- Transform the answer_llm and the answer_orig
- Normalize the vector, calculate the cosine similarity for each pair of answers
- Calculate the average Cosine Similarity

In [31]:
test = [1,2,3]
test_mean = sum(test) / len(test)
test_mean

2.0

In [40]:
similarity = []

for _,r in tqdm(df_results.iterrows()):
    u = pipeline.transform([r['answer_llm']]).ravel()
    v = pipeline.transform([r['answer_orig']]).ravel()

    cosine_result = cosine(u, v)
    similarity.append(cosine_result)

avg_similarity = np.mean(similarity)

1830it [00:02, 716.70it/s]


In [42]:
print(avg_similarity)

0.8415841233490402


## Q6. Rouge
And alternative way to see how two texts are similar is ROUGE.

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:



In [43]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [44]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

There are three scores: rouge-1, rouge-2 and rouge-l, and precision, recall and F1 score for each.

rouge-1 - the overlap of unigrams,
rouge-2 - bigrams,
rouge-l - the longest common subsequence
For the 10th document, Rouge-1 F1 score is 0.45

Let's compute it for the pairs in the entire dataframe. What's the average Rouge-1 F1?

In [49]:
scores['rouge-1']['f']

0.45454544954545456

In [52]:
rouge_scores = []

for _,r in tqdm(df_results.iterrows()):
    score = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
    score = score['rouge-1']['f']
    rouge_scores.append(score)

rouge_avg_similarity = np.mean(rouge_scores)


  0%|                                                                                        | 0/1830 [01:50<?, ?it/s]

55it [00:00, 533.44it/s][A
109it [00:00, 517.21it/s][A
161it [00:00, 285.06it/s][A
198it [00:00, 297.39it/s][A
252it [00:00, 361.23it/s][A
294it [00:00, 371.58it/s][A
336it [00:00, 334.90it/s][A
373it [00:01, 333.40it/s][A
411it [00:01, 345.47it/s][A
448it [00:01, 339.77it/s][A
484it [00:01, 290.70it/s][A
515it [00:01, 272.82it/s][A
554it [00:01, 301.17it/s][A
621it [00:01, 396.27it/s][A
664it [00:01, 374.76it/s][A
704it [00:02, 322.14it/s][A
739it [00:02, 263.24it/s][A
772it [00:02, 275.81it/s][A
803it [00:02, 272.51it/s][A
834it [00:02, 280.68it/s][A
871it [00:02, 301.52it/s][A
920it [00:02, 347.96it/s][A
957it [00:02, 334.74it/s][A
997it [00:03, 349.38it/s][A
1033it [00:03, 333.47it/s][A
1067it [00:03, 283.31it/s][A
1103it [00:03, 301.73it/s][A
1141it [00:03, 321.48it/s][A
1175it [00:03, 296.26it/s][A
1206it [00:03, 279.45it/s][A
1238

In [54]:
print(rouge_avg_similarity)

0.3516946452113943
