In [193]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import minsearch
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity

from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse
import numpy as np
from rouge import Rouge

In [2]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [35]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

In [12]:
text_fields = ["question", "section", "text"]
keyword_fields = ["course", "id"]

In [28]:
boost = {'question': 1.5, 'section': 0.1}

In [14]:
index = minsearch.Index(
    text_fields=text_fields,
    keyword_fields=keyword_fields,
)
index.fit(documents)

<minsearch.minsearch.Index at 0x1167a8490>

In [38]:

def search(query):
    results = index.search(
        query=query["question"],
        filter_dict={'course': query["course"]},
        boost_dict=boost,
        num_results=5
    )

    return results

In [39]:
evaluate(ground_truth, search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [25]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [92]:
# create embeddings for `question` field
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

## Q2. Vector search for question

In [93]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1181fdd10>

In [94]:

def vector_search(query):
    query_embeddings = pipeline.transform([query["question"]])
    results = vindex.search(
        query_vector=query_embeddings,
        filter_dict={'course': query["course"]},
        num_results=5
    )

    return results

In [95]:
evaluate(ground_truth, vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

## Q3. Vector search for question and answer
We only used question in Q2. We can use both question and answer:

In [104]:
# create embeddings for `question` and `text` field
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc["text"]
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [107]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x11875bdd0>

In [108]:

def vector_search_with_doc_and_text(query):
    query_embeddings = pipeline.transform([query["question"]])
    results = vindex.search(
        query_vector=query_embeddings,
        filter_dict={'course': query["course"]},
        num_results=5
    )

    return results

In [109]:
evaluate(ground_truth, vector_search_with_doc_and_text)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4. Qdrant
Now let's evaluate the following settings in Qdrant:

text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5


In [112]:
q_client = QdrantClient("http://localhost:6333")

### Create a Collection

In [113]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [114]:
collection_name = 'vector-search'
SIZE = 512
try:
    q_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=SIZE,
            distance=models.Distance.COSINE,
        )
    )
except UnexpectedResponse as err:
    pass

### Create, Embed & Insert Points into the Collection

In [118]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [122]:
points = []
for idx, course in enumerate(documents):
    point = models.PointStruct(
        id=idx,
        vector=models.Document(text=course['question'] + ' ' + course['text'], model=model_handle),
        payload={
            "text": course['text'],
            "section": course['section'],
            "course": course['course'],
            "id": course["id"],
        }
    )
    points.append(point)       

In [123]:
q_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [158]:
def qdrant_search(query, collection_name=collection_name, model_handle=model_handle, limit=5):

    results = q_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query["question"],
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    payload = [
        results.points[0].payload 
    ]

    return payload

In [159]:
evaluate(ground_truth, qdrant_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7620488437432462, 'mrr': 0.7620488437432462}

## Q5. Cosine simiarity

In [160]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [161]:
df_results.shape

(1830, 5)

In [162]:
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [163]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [168]:
df_results["v_llm"] = df_results["answer_llm"].apply(lambda query: pipeline.transform([query]))
df_results["v_orig"] = df_results["answer_orig"].apply(lambda query: pipeline.transform([query]))

In [169]:
df_results["v_llm"]

Unnamed: 0,answer_llm,answer_orig,document,question,course,v_llm,v_orig
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,"[[0.15549858795799837, 0.11219644369710731, -0...","[[0.22746772878326757, 0.12079641681716718, -0..."
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,"[[0.14894279479454864, 0.1767921364621181, -0....","[[0.22746772878326757, 0.12079641681716718, -0..."
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,"[[0.2624874045715802, 0.14431317946523875, -0....","[[0.22746772878326757, 0.12079641681716718, -0..."
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,"[[0.20216096650756885, 0.08776325923283199, -0...","[[0.22746772878326757, 0.12079641681716718, -0..."
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,"[[0.29736126396042195, -0.00020496794402426882...","[[0.22746772878326757, 0.12079641681716718, -0..."


In [176]:
np.vstack(df_results["v_llm"].values).shape

(1830, 128)

### Avg cosine similarity

In [191]:
np.mean([cosine_similarity(u, v)[0][0]
for u, v in zip(df_results["v_llm"].values, df_results["v_orig"].values)])

np.float64(0.8415841233490402)

## Q6. Rouge

In [194]:
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [211]:
%%time
scores = [
    rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]["rouge-1"]["f"]
for r in df_results.itertuples()
]

CPU times: user 4.24 s, sys: 20.4 ms, total: 4.26 s
Wall time: 4.27 s


In [209]:
sum(scores) / len(scores)

0.3516946452113944