In [2]:
!pip install -U minsearch qdrant_client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


# Evaluation Data

In [3]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [5]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7c00853492e0>

In [6]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [7]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Ans.1 = 0.84

In [8]:
from minsearch import VectorSearch

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [10]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [11]:
# Build and fit index using your embeddings
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7c007a959160>

In [12]:
def vectorsearch_function(q):
    # Encode the query question with the SAME pipeline (no refit)
    query_vec = pipeline.transform([q['question']]).ravel()
    
    # Perform MinSearch retrieval (hybrid search)
    results = vindex.search(
        query_vector=query_vec,     # query embedding
        filter_dict={'course': q['course']},
        num_results=5,             # retrieve top5
    )
    return results


In [13]:
evaluate(ground_truth, vectorsearch_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

### Ans. 2 = 0.35

In [14]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

In [15]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [16]:
# Build and fit index using your embeddings
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7c00799bfb90>

In [17]:
evaluate(ground_truth, vectorsearch_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

### Ans. 3 = 0.82

## Qdrant

In [18]:
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [19]:
from qdrant_client import QdrantClient, models

In [20]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [21]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [22]:
import json

EMBEDDING_DIMENSIONALITY = 512

In [23]:
# Define the collection name
collection_name = "homework3"

# 1. Explicitly delete first
try:
    client.delete_collection(collection_name)
    print("üßπ Deleted old collection.")
except Exception as e:
    print("‚ö†Ô∏è Delete error (maybe it didn't exist):", e)

# 2. Recreate clean collection
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=512, distance=models.Distance.COSINE)
)
print("‚úÖ Recreated clean collection.")
print("Count after recreation:", client.count(collection_name=collection_name).count)

üßπ Deleted old collection.
‚úÖ Recreated clean collection.
Count after recreation: 0


  client.recreate_collection(


In [24]:
import uuid
points = []
for doc in documents:
    combined_text = doc["question"] + " " + doc["text"]
    point = models.PointStruct(
        id=str(uuid.uuid5(uuid.NAMESPACE_DNS, doc["id"])),   # ‚úÖ valid and repeatable UUID
        vector=models.Document(text=doc["text"], model=model_handle),
        payload={
            "document": doc["id"],        # original short ID for evaluation
            "question": doc["question"],
            "text": doc["text"],
            "course": doc.get("course", "")
        }
    )
    points.append(point)

# Upload all points to Qdrant (it will embed them using FastEmbed internally)
from tqdm.auto import tqdm

for i in tqdm(range(0, len(points), 100), desc="Uploading to Qdrant"):
    client.upsert(collection_name=collection_name, points=points[i:i+100])

print("‚úÖ Upload complete. Total points:", client.count(collection_name=collection_name).count)

Uploading to Qdrant:   0%|          | 0/10 [00:00<?, ?it/s]

‚úÖ Upload complete. Total points: 947


In [25]:
from fastembed import TextEmbedding
import numpy as np

fe = TextEmbedding(model_name=model_handle)

def qdrantsearch_function(q, limit=5):
    # Embed the question locally ‚Üí deterministic & reliable
    qvec = np.array([*fe.embed([q["question"]])][0], dtype=np.float32)
    hits = client.search(
        collection_name=collection_name,
        query_vector=qvec,
        limit=limit,
        with_payload=True
    )
    # return original doc ids from payload for evaluator
    return [{"id": h.payload["document"]} for h in hits]

In [26]:
metrics = evaluate(ground_truth, qdrantsearch_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

  hits = client.search(


In [27]:
print(metrics)

{'hit_rate': 0.8106764642316836, 'mrr': 0.6953065341113756}


## Cosine Similarity

In [31]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [37]:
results_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/rag_evaluation/data/results-gpt4o-mini.csv"
df_results = pd.read_csv(results_url)
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [38]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [39]:
pipeline.fit(df_results["answer_llm"] + " " + df_results["answer_orig"] + " " + df_results["question"])

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [40]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [41]:
similarities = []

for _, row in df_results.iterrows():
    v_llm = pipeline.transform([row["answer_llm"]])[0]
    v_orig = pipeline.transform([row["answer_orig"]])[0]
    sim = cosine(v_llm, v_orig)
    similarities.append(sim)

avg_cosine = np.mean(similarities)
print("Average cosine similarity:", round(avg_cosine, 2))

Average cosine similarity: 0.84


## Rouge

In [42]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [43]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [44]:
f1_scores = []

for _, row in df_results.iterrows():
    s = rouge_scorer.get_scores(row["answer_llm"], row["answer_orig"])[0]
    f1_scores.append(s["rouge-1"]["f"])

avg_rouge1_f1 = np.mean(f1_scores)
print("Average Rouge-1 F1:", round(avg_rouge1_f1, 2))

Average Rouge-1 F1: 0.35
