# Homework: Search Evaluation

In [19]:
!pip uninstall minsearch -y
!pip install -U minsearch qdrant_client


Found existing installation: minsearch 0.0.4
Uninstalling minsearch-0.0.4:
  Successfully uninstalled minsearch-0.0.4
Collecting minsearch
  Using cached minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Using cached minsearch-0.0.4-py3-none-any.whl (11 kB)
Installing collected packages: minsearch
Successfully installed minsearch-0.0.4


### Evaluation data

In [20]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

Here, `documents` contains the documents from the FAQ database with unique IDs, and `ground_truth` contains generated question-answer pairs.

### we will need the following code for evaluating retrieval: 

In [21]:
from tqdm.auto import tqdm

# Measures if at least one relevant document is found in the top k results.
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

#  Average rank of the first relevant document across queries.
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

```python
 boost = {'question': 1.5, 'section': 0.1} 
```

In [22]:
from minsearch import Index

boost = {'question': 1.5, 'section': 0.1}

# initalize our index
index = Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=[]
)
index.fit(documents) # making out document indexable 


<minsearch.minsearch.Index at 0x7f889fb16430>

In [23]:
print(f"Total queries: {len(ground_truth)}")
print("Example query:", ground_truth[0])



Total queries: 4627
Example query: {'question': 'When does the course begin?', 'course': 'data-engineering-zoomcamp', 'document': 'c02e79ef'}


In [24]:
# search function for a certain query

def search_function(q):
    return index.search(
        query=q['question'],
        filter_dict=None,
        boost_dict=boost,
        num_results=10
    )

### Now we will feed each question from `ground_truth` to our `search_function` (minsearch), then we will compare the result from the search to the ground_truth answers.

In [25]:
metrics = evaluate(ground_truth, search_function)
print(metrics)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8597363302355738, 'mrr': 0.6897542375497872}


# `Q1-Answer -> 0.85 and the closer answer is 0.84` 

### Embeddings

The latest version of minsearch also supports vector search. We will use it:


We will also use `TF-IDF (Term Frequency – Inverse Document Frequency)` and Singular Value Decomposition to create embeddings from texts.

#### What TF-IDF Does:

It looks at word appearance patterns across the documents.

It gives more weight to:

- Words that appear frequently in a specific document (high term frequency),

- But less frequently across all documents (high inverse document frequency).

In [26]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

#### Let's create embeddings for the "question" field:

In [27]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)
    
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), # Only keep words that appear in at least 3 questions (removes noise/rare words).
    
    # we use random_state for repeatable results (for testing, debugging, or sharing).
    TruncatedSVD(n_components=128, random_state=1) # 128 dimensions, and Hey computer, random_state use the same random choices every time
)

# Creates a reusable pipeline
X = pipeline.fit_transform(texts)

### Q2. Vector search for question

Now let's index these embeddings with minsearch:

In [28]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f889fb16a00>

#### create the `search_function`

In [29]:

def search_function(q):
    query_vec = pipeline.transform([q['question']])
    return vindex.search(query_vec, filter_dict=None)

#### Now let's evaluate 

In [30]:
metrics = evaluate(ground_truth, search_function)
print(metrics)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.4696347525394424, 'mrr': 0.30031389257669755}


# `Q2-Answer -> mrr': 0.3, so close one is 0.35`

In [None]:
# Create the pipeline and fit it
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

Y = pipeline.fit_transform(texts)

# Create and fit the index
vindex_qa = VectorSearch(keyword_fields={'course'})
vindex_qa.fit(Y, documents)

# search
def search_function(q):
    query_vec = pipeline.transform([q['question']])
    return vindex_qa.search(query_vec, filter_dict=None)

# Now evaluate
metrics = evaluate(ground_truth, search_function)
print(metrics)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8415820185865571, 'mrr': 0.6254320739894556}


# `Q3-Answer -> hit_rat : 0.84, so close one is 0.82`

### Q4. Qdrant

Now let's evaluate the following settings in Qdrant:

text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5


In [38]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# Load model
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")

# Initialize client
client = QdrantClient(":memory:")

# Create the collection only if it doesn't already exist
if not client.collection_exists("qa_eval"):
    client.create_collection(
        collection_name="qa_eval",
        vectors_config=VectorParams(size=512, distance=Distance.COSINE)
    )

# Index documents
for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vec = model.encode(text).tolist()
    client.upsert(
        collection_name="qa_eval",
        points=[PointStruct(id=i, vector=vec, payload={"id": i})]
    )

# Define search function
def qdrant_search(doc, k=5):
    query = doc['question'] + ' ' + doc['text']
    qvec = model.encode(query).tolist()

    hits = client.search(
        collection_name="qa_eval",
        query_vector=qvec,
        limit=k
    )
    return [hit.payload['id'] for hit in hits]
# Evaluate
metrics = evaluate(ground_truth, search_function)
print(metrics)

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8415820185865571, 'mrr': 0.6254320739894556}


# `Q4-Answer -> mrr : 0.625 ~ 0.63, so close one from the choice is 0.65`

# Q5. Cosine simiarity

In the second part of the module, we looked at evaluating the entire RAG approach. In particular, we looked at comparing the answer generated by our system with the actual answer from the FAQ.

One of the ways of doing it is using the cosine similarity. Let's see how to calculate it.

Cosine similarity is a dot product between two normalized vectors. In geometrical sense, it's the cosine of the angle between the vectors. Look up "cosine similarity geometry" if you want to learn more about it.

For us, it means that we need two things:

- First, we normalize each of the vectors
- Then, compute the dot product

So, we get this:

```python
def cosine(u, v):
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)
```
For normalization, we first compute the vector norm (its length), and then divide the vector by it:

```python
def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm
```
(where np is import numpy as np)

Or we can simplify it:

```python
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)
```
Now let's use this function to compute the A->Q->A cosine similarity.
We will use the results from our gpt-4o-mini evaluations:

```python
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)
```
When creating embeddings, we will use a simple way - the same we used in the Embeddings section:

```python
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
```

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

# Load CSV from correct raw GitHub URL
url_prefix = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

combined_text = pd.concat([
    df_results["answer_llm"],
    df_results["answer_orig"],
    df_results["question"]
])
pipeline.fit(combined_text)

similarities = []
for i in range(len(df_results)):
    
    v_llm = pipeline.transform([df_results.loc[i, "answer_llm"]])[0]
    v_orig = pipeline.transform([df_results.loc[i, "answer_orig"]])[0]
    
    sim = cosine(v_llm, v_orig)
    
    similarities.append(sim)

average_cosine = np.mean(similarities)
print(f"Average cosine similarity: {average_cosine:.2f}")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Average cosine similarity: 0.75


# `Q5-Answer -> Average cosine similarity: 0.75`

# Q6. Rouge

And alternative way to see how two texts are similar is ROUGE.

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:

```pip install rouge```

Let's compute the ROUGE score between the answers at the index 10 of our dataframe `(doc_id=5170565b)`


```python
    from rouge import Rouge
    rouge_scorer = Rouge()

    r = df_results.iloc[10]
    scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
    scores
```


There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

`rouge-1` - the overlap of unigrams,
`rouge-2` - bigrams,
`rouge-l` - the longest common subsequence

For the 10th document, Rouge-1 F1 score is 0.45

Let's compute it for the pairs in the entire dataframe. What's the average Rouge-1 F1?


In [2]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [3]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [5]:
from rouge import Rouge
import numpy as np
import pandas as pd

# Initialize Rouge scorer
rouge_scorer = Rouge()


# List to store Rouge-1 F1 scores
rouge1_f1_scores = []

for i in range(len(df_results)):
    llm_answer = df_results.loc[i, 'answer_llm']
    orig_answer = df_results.loc[i, 'answer_orig']
    
    # Compute rouge scores between the two texts
    scores = rouge_scorer.get_scores(llm_answer, orig_answer)[0]
    
    # Extract the Rouge-1 F1 score
    rouge1_f1 = scores['rouge-1']['f']
    rouge1_f1_scores.append(rouge1_f1)

# Calculate average Rouge-1 F1 score
average_rouge1_f1 = np.mean(rouge1_f1_scores)
print(f"Average Rouge-1 F1 score: {average_rouge1_f1:.2f}")


Average Rouge-1 F1 score: 0.35


# `Q5-Answer -> Average Rouge-1 F1 score: 0.35`

### done!!!
